# Source code for fastgedcom.parser

"""Functions to parse gedcom files into :py:class:`.Document`.

On module import, register the ansel and gedcom codecs from the `ansel python library
<https://pypi.org/project/ansel/>`_.
"""

from typing import Iterable
from dataclasses import dataclass
from pathlib import Path

from .base import Document, TrueLine, XRef

# The ansel package is optional: remember whether it could be imported so
# that the rest of the module can skip the ANSEL-specific handling.
try:
    import ansel  # type: ignore
except ImportError:
    IS_ANSEL_INSTALLED = False
else:
    IS_ANSEL_INSTALLED = True
    # Make the codecs shipped with the ansel package known to Python.
    # This happens once, when the module is imported.
    ansel.register()



[docs]
class ParsingWarning:
    """Common ancestor of every warning reported by the parser."""



@dataclass
class LineParsingWarning(ParsingWarning):
    """Warn about a line made of a single word.

    A line should contain at least a line level and a tag.
    """

    # 1-based position of the offending line in the parsed input.
    line_number: int
    # Text of the offending line, exactly as the parser received it.
    line_content: str




@dataclass
class DuplicateXRefWarning(ParsingWarning):
    """Warn about a cross-reference identifier that is defined twice."""

    # Identifier that was already present among the document records.
    xref: XRef




@dataclass
class LevelInconsistencyWarning(ParsingWarning):
    """Warn about a line without a correct parent line.

    Either no previous line can act as parent, or the level of the line
    is not exactly one more than the level of its parent.
    """

    # 1-based position of the offending line in the parsed input.
    line_number: int
    # Text of the offending line, exactly as the parser received it.
    line_content: str




@dataclass
class LevelParsingWarning(ParsingWarning):
    """Warn about an unparsable line level.

    The first word of the line could not be converted to an integer.
    """

    # 1-based position of the offending line in the parsed input.
    line_number: int
    # Text of the offending line, exactly as the parser received it.
    line_content: str




@dataclass
class EmptyLineWarning(ParsingWarning):
    """Warn about an empty line."""

    # 1-based position of the empty line in the parsed input.
    line_number: int




@dataclass
class CharacterInsteadOfLineWarning(ParsingWarning):
    """Warn about the presence of a 1-character-long line.

    This happens when the object parsed is an iterable on characters,
    whereas an iterable on lines is expected.
    """

    # 1-based position of the 1-character-long item in the parsed input.
    line_number: int





[docs]
def parse(lines: Iterable[str]) -> tuple[Document, list[ParsingWarning]]:
    """Build a :py:class:`.Document` from an iterable of gedcom lines.

    A malformed line never raises: a :py:class:`.ParsingWarning` is
    recorded and the parsing goes on with the next line, except that:

    - :py:class:`.CharacterInsteadOfLineWarning` stops the parsing.
    - :py:class:`.LevelInconsistencyWarning` keeps the line in the tree
      (under the closest previous line of lower level) whenever such a
      parent exists. The line is dropped only when there is no parent.

    Return the :py:class:`.Document` and the list of
    :py:class:`.ParsingWarning` encountered.
    """
    document = Document()
    warnings: list[ParsingWarning] = []
    # Chain going from the current record down to the last inserted line.
    # The last item is the candidate parent of the next line.
    ancestors: list[TrueLine] = []
    for line_number, line in enumerate(lines, start=1):
        fields = line.rstrip().split(' ', 2)
        if len(fields) == 1:
            # No space in the line: it is either empty or a single word.
            if fields[0] == "":
                warnings.append(EmptyLineWarning(line_number))
                continue
            if len(line) == 1:
                # The input yields characters, not lines: give up.
                warnings.append(CharacterInsteadOfLineWarning(line_number))
                break
            warnings.append(LineParsingWarning(line_number, line))
            continue
        # The payload is optional: a line may be just a level and a tag.
        payload = fields[2] if len(fields) == 3 else ""
        try:
            current = TrueLine(int(fields[0]), fields[1], payload, [])
        except ValueError:
            warnings.append(LevelParsingWarning(line_number, line))
            continue
        if current.level == 0:
            # A level-0 line opens a new record, stored under its tag.
            if current.tag in document.records:
                warnings.append(DuplicateXRefWarning(current.tag))
            document.records[current.tag] = current
            ancestors = [current]
            continue
        # Discard the ancestors that are not strictly above this line.
        while ancestors and ancestors[-1].level >= current.level:
            ancestors.pop()
        if not ancestors:
            warnings.append(LevelInconsistencyWarning(line_number, line))
            continue
        parent = ancestors[-1]
        if current.level != parent.level + 1:
            # Level gap: report it, but still attach the line.
            warnings.append(LevelInconsistencyWarning(line_number, line))
        parent.sub_lines.append(current)
        ancestors.append(current)
    return document, warnings




[docs]
def guess_encoding(file: str | Path) -> str | None:
    """Return the guessed encoding of the ``file``. None if unknown.

    A gedcom should precise its encoding in the header under the tag CHAR.

    However, indication of that field are often misleading or incomplete.
    For example:
    - ANSEL refers to the gedcom version of the ansel charset.
    - The use of a BOM mark is recommended but not stated,
    and not automatically handled by Python.
    - UNICODE refers to UTF-16.
    """
    # check BOM mark to deduce UTF family encodings
    # see http://unicode.org/faq/utf_bom.html#bom4
    with open(file, "rb") as f:
        first_bytes = f.read(4)
    if first_bytes[:3] == b"\xef\xbb\xbf":
        # UTF-8
        # The presence of the BOM mark must be specified.
        # With "utf-8-sig, Python removes the BOM mark when reading the file.
        return "utf-8-sig"
    if first_bytes == b"\xff\xfe\x00\x00":
        # UTF-32, little-endian
        # With "utf_32", Python removes the BOM mark when reading the file.
        return "utf_32"
    if first_bytes == b"\x00\x00\xfe\xff":
        # UTF-32, big-endian
        # With "utf_32", Python removes the BOM mark when reading the file.
        return "utf_32"
    if first_bytes[:2] == b"\xff\xfe":
        # UTF-16, little-endian
        # With "utf_16", Python removes the BOM mark when reading the file.
        return "utf_16"
    if first_bytes[:2] == b"\xfe\xff":
        # UTF-16, big-endian
        # With "utf_16", Python removes the BOM mark when reading the file.
        return "utf_16"
    # Try non-utf encodings and loog at the 0 HEAD > 1 CHAR gedcom field
    encodings = (
        "utf-8",
        "ansel" if IS_ANSEL_INSTALLED else None,
        "iso8859-1",
    )
    for encoding in encodings:
        if encoding is None:
            continue
        try:
            with open(file, "r", encoding=encoding) as f:
                for line in f:
                    if line.startswith("1 CHAR "):
                        stated_encoding = line[7:-1].lower()
                        if stated_encoding == "ansel":
                            return "gedcom"
                        return stated_encoding
        except UnicodeError:
            pass
    return None




[docs]
class ParsingError(Exception):
    """Base class of the errors raised by :py:func:`.strict_parse`."""




[docs]
class NothingParsedError(ParsingError):
    """Error raised by :py:func:`.strict_parse` for an empty resulting document."""



@dataclass
class MalformedError(ParsingError):
    """Raised by :py:func:`.strict_parse` when there are warnings."""

    # Warnings collected while parsing the file.
    warnings: list[ParsingWarning]





[docs]
def strict_parse(file: str | Path) -> Document:
    """Open the gedcom file, parse it and return its :py:class:`.Document`.

    The file is read with the encoding returned by
    :py:func:`.guess_encoding`.

    Raise :py:exc:`.MalformedError` when the parsing produced at least
    one warning.
    Raise :py:exc:`.NothingParsedError` when there is no warning but the
    resulting document has no record.
    """
    encoding = guess_encoding(file)
    with open(file, mode="r", encoding=encoding) as stream:
        document, warnings = parse(stream)
    # Warnings take precedence over the emptiness check.
    if warnings:
        raise MalformedError(warnings)
    record_count = len(document.records)
    if record_count == 0:
        raise NothingParsedError()
    return document