from __future__ import annotations
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING
import onigurumacffi as re
from .elements import Capture, ContentBlockElement, ContentElement
from .handler import POS, ContentHandler, Pattern
from .utils.exceptions import IncludedParserNotFound
from .utils.logger import LOGGER, track_depth
if TYPE_CHECKING:
from .parsers.base import LanguageParser
[docs]
class GrammarParser(ABC):
    """The abstract grammar parser object.

    Sub-classes implement ``_parse``. Callers use ``parse``, which first makes sure the
    repository of the owning language parser has been initialized.
    """

    @staticmethod
    def initialize(grammar: dict, **kwargs):
        """
        Initializes the parser based on the grammar.

        The parser type is selected from the keys present in ``grammar``, checked in the order
        ``include``, ``match``, ``begin`` + ``end``, ``begin`` + ``while``, ``patterns``. A grammar
        with none of these keys yields a ``TokenParser``.

        :param grammar: The grammar to initialize the parser with.
        :param kwargs: Additional keyword arguments, forwarded to the parser constructor.
        :return: The initialized parser, or the raw ``include`` value when the grammar is an
            inclusion. Inclusions are resolved later through ``_find_include``.
        """
        if "include" in grammar:
            return grammar["include"]
        if "match" in grammar:
            return MatchParser(grammar, **kwargs)
        if "begin" in grammar and "end" in grammar:
            return BeginEndParser(grammar, **kwargs)
        if "begin" in grammar and "while" in grammar:
            return BeginWhileParser(grammar, **kwargs)
        if "patterns" in grammar:
            return PatternsParser(grammar, **kwargs)
        return TokenParser(grammar, **kwargs)

    def __init__(
        self,
        grammar: dict,
        language_parser: LanguageParser | None = None,
        key: str = "",
        is_capture: bool = False,
        **kwargs,
    ) -> None:
        """
        Initialize a Parser object.

        :param grammar: The grammar dictionary.
        :param language_parser: The language parser object. Defaults to None.
        :param key: The key for the parser. Defaults to "".
        :param is_capture: Indicates if the parser is a capture. Defaults to False.
        :param kwargs: Additional keyword arguments (accepted and ignored here).
        :return: None
        """
        self.grammar = grammar
        self.language_parser = language_parser
        self.key = key
        self.token = grammar.get("name", "")
        self.is_capture = is_capture
        # Set to True once the inclusions of this parser have been resolved
        self.initialized = False
        # Sub-classes set this to True when their (begin) pattern contains ``\G``
        self.anchored = False

    @property
    def comment(self) -> str:
        """The ``comment`` entry of the grammar, or an empty string."""
        return self.grammar.get("comment", "")

    @property
    def disabled(self) -> bool:
        """The ``disabled`` entry of the grammar, ``False`` when absent."""
        return self.grammar.get("disabled", False)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}:<{self.key}>"

    def _init_captures(self, grammar: dict, key: str = "captures", **kwargs) -> dict:
        """Initializes a captures dictionary.

        :param grammar: The grammar dictionary holding the captures.
        :param key: The grammar key under which the captures are stored.
        :param kwargs: Additional keyword arguments (unused).
        :return: A mapping of integer group id to the initialized parser, or to the raw
            ``include`` value when the capture is an inclusion. Empty when ``key`` is absent.
        """
        return {
            int(group_id): self.initialize(
                pattern, language_parser=self.language_parser, is_capture=True
            )
            for group_id, pattern in grammar.get(key, {}).items()
        }

    def _find_include(self, key: str, **kwargs) -> GrammarParser:
        """Find the included grammar during repository initialization.

        :param key: The include value: ``$self``/``$base``, ``#<repository key>`` or anything
            else, which is delegated to ``language_parser._find_include_scopes``.
        :param kwargs: Additional keyword arguments (unused).
        :return: The parser the include refers to.
        :raises IncludedParserNotFound: When there is no language parser to resolve against, or
            when a ``#`` include names a key that is missing from the repository.
        """
        if not self.language_parser:
            raise IncludedParserNotFound(key)

        if key in ("$self", "$base"):  # TODO there is a difference between these
            return self.language_parser

        # ``startswith`` rather than ``key[0]`` so that an empty key does not raise IndexError
        if key.startswith("#"):
            included = self.language_parser.repository.get(key[1:], None)
            if included is None:
                # Fail here with a clear error instead of handing ``None`` to the caller,
                # which would crash later on ``parser.initialized``.
                raise IncludedParserNotFound(key)
            return included

        return self.language_parser._find_include_scopes(key)

    @abstractmethod
    def _parse(
        self,
        handler: ContentHandler,
        starting: POS,
        **kwargs,
    ) -> tuple[bool, list[Capture | ContentElement], tuple[int, int] | None]:
        """The abstract method which all parsers must implement.

        The ``_parse`` method is called by ``parse`` and should contain all the rules for the
        extended parser.

        :param handler: The content handler to handle the parsed elements.
        :param starting: The starting position of the parsing.
        :param kwargs: Additional keyword arguments.
        :return: A tuple containing the parsing result, a list of parsed elements, and the span
            of the parsing.
        """
        pass

    def _initialize_repository(self, **kwargs) -> None:
        """Initializes the repository's inclusions.

        When the grammar has patterns, this method should be called to initialize its inclusions.
        This should occur after all sub patterns have been initialized. The base implementation
        does nothing.
        """
        return

    def parse(
        self,
        handler: ContentHandler,
        starting: POS = (0, 0),
        boundary: POS | None = None,
        **kwargs,
    ) -> tuple[bool, list[Capture | ContentElement], tuple[int, int] | None]:
        """
        The method to parse a handler using the current grammar.

        :param handler: The ContentHandler object that will handle the parsed content.
        :param starting: The starting position for parsing. Defaults to (0, 0).
        :param boundary: The boundary position for parsing. Defaults to None.
        :param **kwargs: Additional keyword arguments that can be passed to the parser.
        :return: A tuple containing:
            - parsed: A boolean indicating whether the parsing was successful.
            - elements: A list of Capture or ContentElement objects representing the parsed content.
            - span: A tuple containing the starting and ending positions of the parsed content,
              or None if parsing failed.
        """
        # Inclusions are resolved lazily, through the owning language parser
        if not self.initialized and self.language_parser is not None:
            self.language_parser._initialize_repository()
        parsed, elements, span = self._parse(handler, starting, boundary=boundary, **kwargs)
        return parsed, elements, span

    def match_and_capture(
        self,
        handler: ContentHandler,
        pattern: Pattern,
        starting: POS,
        boundary: POS,
        parsers: dict[int, GrammarParser] | None = None,
        parent_capture: Capture | None = None,
        **kwargs,
    ) -> tuple[tuple[POS, POS] | None, str, list[Capture | ContentElement]]:
        """Matches a pattern and its capture groups.

        Matches the pattern on the handler between the starting and boundary positions. If the
        pattern is matched and capture parsers are given, a single Capture object is created for
        the match. The capture is not parsed by this method.

        :param handler: The content handler to match the pattern on.
        :param pattern: The pattern to match.
        :param starting: The starting position for the match.
        :param boundary: The boundary position for the match.
        :param parsers: A dictionary of capture group parsers.
        :param parent_capture: The parent capture object. A match that produces a capture equal
            to its parent is rejected.
        :param kwargs: Additional keyword arguments, forwarded to ``handler.search`` and ``Capture``.
        :return: A tuple containing the span of the match, the matched string, and a list of
            capture objects. ``(None, "", [])`` when nothing (acceptable) was matched.
        """
        if parsers is None:
            parsers = {}

        matching, span = handler.search(pattern, starting=starting, boundary=boundary, **kwargs)
        if not matching:
            return None, "", []

        if not parsers:
            # No capture groups to process, only report the match itself
            return span, matching.group(), []

        capture = Capture(
            handler,
            pattern,
            matching,
            parsers,
            starting,
            boundary,
            key=self.key,
            **kwargs,
        )
        if parent_capture is not None and capture == parent_capture:
            return None, "", []
        return span, matching.group(), [capture]
[docs]
class TokenParser(GrammarParser):
    """Parser for grammars that carry nothing but a token name."""

    def __init__(self, grammar: dict, **kwargs) -> None:
        super().__init__(grammar, **kwargs)
        # There are no inclusions to resolve for a bare token
        self.initialized = True

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}:{self.token}"

    @track_depth
    def _parse(
        self,
        handler: ContentHandler,
        starting: POS,
        boundary: POS,
        **kwargs,
    ) -> tuple[bool, list[Capture | ContentElement], tuple[POS, POS] | None]:
        """Wrap everything between ``starting`` and ``boundary`` in a single element.

        No regex pattern is involved: the element always spans the full requested range and
        the handler anchor is moved to the boundary column.
        """
        content = handler.read_pos(starting, boundary)
        token_element = ContentElement(
            token=self.token,
            grammar=self.grammar,
            content=content,
            characters=handler.chars(starting, boundary),
        )
        handler.anchor = boundary[1]

        depth = kwargs.get("depth", 0)
        LOGGER.info(
            f"{self.__class__.__name__} found < {repr(content)} >",
            self,
            starting,
            depth,
        )

        found: list[Capture | ContentElement] = [token_element]
        return True, found, (starting, boundary)
[docs]
class MatchParser(GrammarParser):
    """Parser for grammars that provide a single ``match`` pattern."""

    def __init__(self, grammar: dict, **kwargs) -> None:
        super().__init__(grammar, **kwargs)
        expression = grammar["match"]
        self.exp_match = re.compile(expression)
        self.parsers = self._init_captures(grammar, key="captures")
        # A ``\G`` in the expression marks this parser as anchored
        if "\\G" in expression:
            self.anchored = True

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        if self.token:
            return f"{class_name}:{self.token}"
        # Without a token, fall back to the key or a slug made from the grammar comment
        identifier = self.key if self.key else "_".join(self.comment.lower().split(" "))
        return f"{class_name}:<{identifier}>"

    def _initialize_repository(self, **kwargs) -> None:
        """Resolve included capture parsers and initialize every capture parser."""
        self.initialized = True

        # First replace raw include values by the parsers they refer to
        for group_id, candidate in self.parsers.items():
            if isinstance(candidate, GrammarParser):
                continue
            self.parsers[group_id] = self._find_include(candidate)

        # Then initialize whatever has not been initialized yet
        for capture_parser in self.parsers.values():
            if capture_parser.initialized:
                continue
            capture_parser._initialize_repository()

    @track_depth
    def _parse(
        self,
        handler: ContentHandler,
        starting: POS,
        boundary: POS,
        **kwargs,
    ) -> tuple[bool, list[Capture | ContentElement], tuple[POS, POS] | None]:
        """Match the ``match`` pattern and wrap the result in an element when a token is set."""
        span, content, captures = self.match_and_capture(
            handler,
            pattern=self.exp_match,
            starting=starting,
            boundary=boundary,
            parsers=self.parsers,
            **kwargs,
        )

        if span is None:
            LOGGER.debug(
                f"{self.__class__.__name__} no match",
                self,
                starting,
                kwargs.get("depth", 0),
            )
            return False, [], None

        LOGGER.info(
            f"{self.__class__.__name__} found < {repr(content)} >",
            self,
            starting,
            kwargs.get("depth", 0),
        )

        if not self.token:
            # Nameless match: the captures themselves are the result
            return True, captures, span

        wrapped: list[Capture | ContentElement] = [
            ContentElement(
                token=self.token,
                grammar=self.grammar,
                content=content,
                characters=handler.chars(*span),
                children=captures,
            )
        ]
        return True, wrapped, span
[docs]
class ParserHasPatterns(GrammarParser, ABC):
    """Base class for parsers whose grammar may carry a ``patterns`` list."""

    def __init__(self, grammar: dict, **kwargs) -> None:
        super().__init__(grammar, **kwargs)
        # Entries are parsers, or raw include values that are resolved in ``_initialize_repository``
        self.patterns = [
            self.initialize(pattern, language_parser=self.language_parser)
            for pattern in grammar.get("patterns", [])
        ]

    def _initialize_repository(self, **kwargs) -> None:
        """When the grammar has patterns, this method should be called to initialize its inclusions.

        Resolves included patterns, initializes the sub parsers, inlines the patterns of included
        ``PatternsParser`` instances and appends the injection patterns of the language parser.

        :param kwargs: Additional keyword arguments (unused); accepted for compatibility with
            ``GrammarParser._initialize_repository``.
        """
        # Mark as initialized first so that cyclic inclusions do not recurse forever
        self.initialized = True

        self.patterns = [
            parser if isinstance(parser, GrammarParser) else self._find_include(parser)
            for parser in self.patterns
        ]
        for parser in self.patterns:
            if not parser.initialized:
                parser._initialize_repository()

        # Copy patterns from included pattern parsers. A single pass expands every occurrence
        # in place, one level deep, without looking parsers up again by index.
        # NOTE(review): ``BeginWhileParser`` derives from ``PatternsParser`` and is therefore
        # inlined here as well -- confirm that this is intended.
        flattened = []
        for parser in self.patterns:
            if isinstance(parser, PatternsParser):
                flattened.extend(parser.patterns)
            else:
                flattened.append(parser)
        self.patterns = flattened

        # Injection grammars
        if self.language_parser is None:
            # Without a language parser there is nothing to inject
            return
        for exception_scopes, injection_pattern in self.language_parser.injections:
            if self.token:
                # Only inject when the top-level scope of the token is not excluded
                if self.token.split(".")[0] not in exception_scopes:
                    self.patterns.append(injection_pattern)
            elif self.is_capture:
                self.patterns.append(injection_pattern)
[docs]
class PatternsParser(ParserHasPatterns):
    """The parser for grammars for which several patterns are provided."""

    @track_depth
    def _parse(
        self,
        handler: ContentHandler,
        starting: POS,
        boundary: POS | None = None,
        greedy: bool = False,
        find_one: bool = True,
        **kwargs,
    ) -> tuple[bool, list[Capture | ContentElement], tuple[POS, POS]]:
        """The parse method for grammars for which several patterns are provided.

        The enabled sub patterns are tried in order from the current position, until the boundary
        is reached or no further progress can be made.

        :param handler: The content handler to parse.
        :param starting: The position to start parsing from.
        :param boundary: The position to stop at. Defaults to the end of the last line of the handler.
        :param greedy: Forwarded to the ``_parse`` method of the sub patterns in the first attempt.
        :param find_one: When True, return the result of the first sub pattern that parses.
        :param kwargs: Additional keyword arguments, forwarded to the sub patterns.
        :return: A tuple of the parsing result, the parsed elements and the parsed span. With
            ``find_one`` and a parsed sub pattern, this is the result of that sub pattern.
        """
        if boundary is None:
            boundary = (len(handler.lines) - 1, handler.line_lengths[-1])

        parsed = False
        elements: list[Capture | ContentElement] = []
        patterns = [parser for parser in self.patterns if not parser.disabled]

        current = (starting[0], starting[1])

        while current < boundary:
            for parser in patterns:
                # Try to find patterns
                parsed, captures, span = parser._parse(
                    handler,
                    current,
                    boundary=boundary,
                    greedy=greedy,
                    **kwargs,
                )
                if parsed:
                    if find_one:
                        LOGGER.info(
                            f"{self.__class__.__name__} found single element",
                            self,
                            current,
                            kwargs.get("depth", 0),
                        )
                        return True, captures, span
                    elements.extend(captures)
                    current = span[1]
                    break
            else:
                # None of the patterns parsed; with find_one there is no second attempt
                if find_one:
                    break

            if not parsed and not greedy:
                # Try again if previously allowed no leading white space characters, only when multiple patterns are to be found
                options_span, options_elements = {}, {}
                for parser in patterns:
                    parsed, captures, span = parser._parse(
                        handler,
                        current,
                        boundary=boundary,
                        greedy=True,
                        **kwargs,
                    )
                    if parsed:
                        options_span[parser] = span
                        options_elements[parser] = captures
                        LOGGER.debug(
                            f"{self.__class__.__name__} found pattern choice",
                            self,
                            current,
                            kwargs.get("depth", 0),
                        )

                if options_span:
                    # Choose the option that starts first; ties are resolved by pattern order
                    parser = sorted(
                        options_span,
                        key=lambda parser: (
                            *options_span[parser][0],
                            patterns.index(parser),
                        ),
                    )[0]
                    current = options_span[parser][1]
                    elements.extend(options_elements[parser])
                    LOGGER.info(
                        f"{self.__class__.__name__} chosen pattern of {parser}",
                        self,
                        current,
                        kwargs.get("depth", 0),
                    )
                elif self != self.language_parser:
                    # Only the language parser itself continues on the next line
                    break
                else:
                    remainder = handler.read_line(current)
                    if not remainder.isspace():
                        LOGGER.warning(
                            f"{self.__class__.__name__} remainder of line not parsed: {remainder}",
                            self,
                            current,
                            kwargs.get("depth", 0),
                        )
                    # NOTE(review): ``<=`` allows ``current[0]`` to become ``len(handler.lines)``,
                    # which is then used to index ``handler.line_lengths`` below -- confirm that
                    # this cannot run one past the last line.
                    if current[0] + 1 <= len(handler.lines):
                        current = (current[0] + 1, 0)
                    else:
                        LOGGER.debug(
                            f"{self.__class__.__name__} EOF encountered",
                            self,
                            current,
                            kwargs.get("depth", 0),
                        )
                        break

            if current == starting:
                # Guard against looping forever when nothing was consumed
                LOGGER.warning(
                    f"{self.__class__.__name__} handler did not move after a search round",
                    self,
                    starting,
                    kwargs.get("depth", 0),
                )
                break

            line_length = handler.line_lengths[current[0]]
            if current[1] in [line_length, line_length - 1]:
                # At (or one before) the end of the line: continue at the start of the first
                # following line with a length above 1, or stop when there is none
                try:
                    empty_lines = next(
                        i for i, v in enumerate(handler.line_lengths[current[0] + 1 :]) if v > 1
                    )
                    current = (current[0] + 1 + empty_lines, 0)
                except StopIteration:
                    break

        if self.token:
            # Wrap the found elements in one element; its content is read from starting to boundary
            elements = [
                ContentElement(
                    token=self.token,
                    grammar=self.grammar,
                    content=handler.read_pos(starting, boundary),
                    characters=handler.chars(starting, boundary),
                    children=elements,
                )
            ]

        return bool(elements), elements, (starting, current)
[docs]
class BeginEndParser(ParserHasPatterns):
    """The parser for grammars for which a begin/end pattern is provided."""

    def __init__(self, grammar: dict, **kwargs) -> None:
        """Compile the begin and end patterns and initialize their capture parsers.

        :param grammar: The grammar dictionary, which must contain ``begin`` and ``end``.
        :param kwargs: Additional keyword arguments, forwarded to the parent constructor.
        """
        super().__init__(grammar, **kwargs)
        if "contentName" in grammar:
            # The token then applies to the content between the begin and end matches only
            self.token = grammar["contentName"]
            self.between_content = True
        else:
            self.token = grammar.get("name")
            self.between_content = False
        self.apply_end_pattern_last = grammar.get("applyEndPatternLast", False)
        self.exp_begin = re.compile(grammar["begin"])
        self.exp_end = re.compile(grammar["end"])
        self.parsers_begin = self._init_captures(grammar, key="beginCaptures")
        self.parsers_end = self._init_captures(grammar, key="endCaptures")
        if "\\G" in grammar["begin"]:
            self.anchored = True

    def __repr__(self) -> str:
        if self.token:
            return f"{self.__class__.__name__}:{self.token}"
        else:
            # Without a token, fall back to the key or a slug made from the grammar comment
            identifier = self.key if self.key else "_".join(self.comment.lower().split(" "))
            return f"{self.__class__.__name__}:<{identifier}>"

    def _initialize_repository(self, **kwargs) -> None:
        """When the grammar has patterns, this method should be called to initialize its inclusions.

        Besides the patterns handled by the parent class, the included begin and end capture
        parsers are resolved and initialized.
        """
        self.initialized = True
        super()._initialize_repository()
        # Replace raw include values by the parsers they refer to
        for key, value in self.parsers_end.items():
            if not isinstance(value, GrammarParser):
                self.parsers_end[key] = self._find_include(value)
        for key, value in self.parsers_begin.items():
            if not isinstance(value, GrammarParser):
                self.parsers_begin[key] = self._find_include(value)
        for parser in self.parsers_begin.values():
            if not parser.initialized:
                parser._initialize_repository()
        for parser in self.parsers_end.values():
            if not parser.initialized:
                parser._initialize_repository()

    @track_depth
    def _parse(
        self,
        handler: ContentHandler,
        starting: POS,
        boundary: POS,
        greedy: bool = False,
        **kwargs,
    ) -> tuple[bool, list[Capture | ContentElement], tuple[POS, POS] | None]:
        """The parse method for grammars for which a begin/end pattern is provided.

        After the begin pattern is matched, the sub patterns and the end pattern are searched
        repeatedly until the end pattern is accepted or the boundary is passed.

        :param handler: The content handler to parse.
        :param starting: The position to start matching the begin pattern from.
        :param boundary: The position to stop at.
        :param greedy: Forwarded to the search of the begin pattern.
        :param kwargs: Additional keyword arguments, forwarded to the searches and sub patterns.
        :return: ``(False, [], None)`` when the begin pattern is not matched. Otherwise True, the
            parsed elements, and the span from the start of the begin match to the end of the
            end match (or to the boundary when no end pattern was accepted).
        """
        begin_span, _, begin_elements = self.match_and_capture(
            handler,
            self.exp_begin,
            starting,
            boundary=boundary,
            parsers=self.parsers_begin,
            greedy=greedy,
            **kwargs,
        )

        if not begin_span:
            LOGGER.debug(
                f"{self.__class__.__name__} no begin match",
                self,
                starting,
                kwargs.get("depth", 0),
            )
            return False, [], None

        LOGGER.info(
            f"{self.__class__.__name__} found begin",
            self,
            starting,
            kwargs.get("depth", 0),
        )

        # Get initial and boundary positions
        current = begin_span[1]
        # NOTE(review): the begin search above already received ``boundary`` before this default
        # is applied -- confirm that ``handler.search`` accepts a ``None`` boundary.
        if boundary is None:
            boundary = (len(handler.lines) - 1, handler.line_lengths[-1])

        # Define loop parameters
        end_elements: list[Capture | ContentElement] = []
        mid_elements: list[Capture | ContentElement] = []
        patterns = [parser for parser in self.patterns if not parser.disabled]
        first_run = True

        while current <= boundary:
            parsed = False

            # Create boolean that is enabled when a parser is recursively called. In this case its end pattern should
            # be applied last, otherwise the same span will be recognized as the end pattern by the upper level parser
            apply_end_pattern_last = False

            # Try to find patterns first with no leading whitespace characters allowed
            for parser in patterns:
                parsed, capture_elements, capture_span = parser._parse(
                    handler, current, boundary=boundary, greedy=False, **kwargs
                )
                if parsed:
                    if parser == self:
                        apply_end_pattern_last = True
                    LOGGER.debug(
                        f"{self.__class__.__name__} found pattern (no ws)",
                        self,
                        current,
                        kwargs.get("depth", 0),
                    )
                    break

            # Try to find the end pattern with no leading whitespace characters allowed
            end_span, _, end_elements = self.match_and_capture(
                handler,
                self.exp_end,
                current,
                boundary=boundary,
                parsers=self.parsers_end,
                greedy=False,
                **kwargs,
            )

            if not parsed and not end_span:
                # Try to find the patterns and end pattern allowing for leading whitespace characters
                LOGGER.info(
                    f"{self.__class__.__name__} getting all pattern options",
                    self,
                    current,
                    kwargs.get("depth", 0),
                )

                options_span, options_elements = {}, {}
                for parser in patterns:
                    parsed, capture_elements, capture_span = parser._parse(
                        handler,
                        current,
                        boundary=boundary,
                        greedy=True,
                        **kwargs,
                    )
                    if parsed:
                        options_span[parser] = capture_span
                        options_elements[parser] = capture_elements
                        LOGGER.debug(
                            f"{self.__class__.__name__} found pattern choice",
                            self,
                            current,
                            kwargs.get("depth", 0),
                        )

                if options_span:
                    parsed = True
                    # Choose the option that starts first; ties are resolved by pattern order
                    parser = sorted(
                        options_span,
                        key=lambda parser: (
                            *options_span[parser][0],
                            patterns.index(parser),
                        ),
                    )[0]
                    capture_span = options_span[parser]
                    capture_elements = options_elements[parser]

                    if parser == self:
                        apply_end_pattern_last = True

                    LOGGER.info(
                        f"{self.__class__.__name__} chosen pattern of {parser}",
                        self,
                        current,
                        kwargs.get("depth", 0),
                    )

                end_span, end_content, end_elements = self.match_and_capture(
                    handler,
                    self.exp_end,
                    current,
                    boundary=boundary,
                    parsers=self.parsers_end,
                    greedy=True,
                    **kwargs,
                )

            if end_span:
                if parsed:
                    # Check whether the capture pattern has the same closing positions as the end pattern
                    capture_before_end = handler.prev(capture_span[1])
                    if handler.read(capture_before_end, skip_newline=False) == "\n":
                        # If capture pattern ends with \n, both left and right of \n is considered end
                        pattern_at_end = end_span[1] in [
                            capture_before_end,
                            capture_span[1],
                        ]
                    else:
                        pattern_at_end = end_span[1] == capture_span[1]

                    end_before_pattern = end_span[0] <= capture_span[0]
                    empty_span_end = end_span[1] == end_span[0]

                    if pattern_at_end and (end_before_pattern or empty_span_end):
                        if empty_span_end:
                            # Both found capture pattern and end pattern are accepted, break pattern search
                            LOGGER.debug(
                                f"{self.__class__.__name__} capture+end: both accepted, break",
                                self,
                                current,
                                kwargs.get("depth", 0),
                            )
                            mid_elements.extend(capture_elements)
                            closing = end_span[0] if self.between_content else end_span[1]
                            break
                        elif not self.apply_end_pattern_last and not apply_end_pattern_last:
                            # End pattern prioritized over capture pattern, break pattern search
                            LOGGER.debug(
                                f"{self.__class__.__name__} capture+end: end prioritized, break",
                                self,
                                current,
                                kwargs.get("depth", 0),
                            )
                            closing = end_span[0] if self.between_content else end_span[1]
                            break
                        else:
                            # Capture pattern prioritized over end pattern, continue pattern search
                            LOGGER.debug(
                                f"{self.__class__.__name__} capture+end: capture prioritized, continue",
                                self,
                                current,
                                kwargs.get("depth", 0),
                            )
                            mid_elements.extend(capture_elements)
                            current = capture_span[1]

                    elif capture_span[0] < end_span[0]:
                        # Capture pattern found before end pattern, continue pattern search
                        LOGGER.debug(
                            f"{self.__class__.__name__} capture<end: leading capture, continue",
                            self,
                            current,
                            kwargs.get("depth", 0),
                        )
                        mid_elements.extend(capture_elements)
                        current = capture_span[1]
                    else:
                        # End pattern found before capture pattern, break pattern search
                        LOGGER.debug(
                            f"{self.__class__.__name__} end<capture: leading end, break",
                            self,
                            current,
                            kwargs.get("depth", 0),
                        )
                        closing = end_span[0] if self.between_content else end_span[1]
                        break
                else:
                    # No capture pattern found, accept end pattern and break pattern search
                    LOGGER.debug(
                        f"{self.__class__.__name__} end: break",
                        self,
                        current,
                        kwargs.get("depth", 0),
                    )
                    closing = end_span[0] if self.between_content else end_span[1]
                    break
            else:  # No end pattern found
                if parsed:
                    # Append found capture pattern and find next starting position
                    mid_elements.extend(capture_elements)

                    if handler.read(capture_span[1], skip_newline=False) == "\n":
                        # Next character after capture pattern is newline
                        LOGGER.debug(
                            f"{self.__class__.__name__} capture: next is newline, continue",
                            self,
                            current,
                            kwargs.get("depth", 0),
                        )

                        # NOTE(review): ``allow_leading_all`` is passed on through
                        # ``match_and_capture`` to ``handler.search`` and ``Capture`` -- confirm
                        # that both accept this keyword.
                        end_span, _, _ = self.match_and_capture(
                            handler,
                            self.exp_end,
                            capture_span[1],
                            boundary=boundary,
                            parsers=self.parsers_end,
                            allow_leading_all=False,
                            **kwargs,
                        )

                        if end_span and end_span[1] <= handler.next(capture_span[1]):
                            # Potential end pattern can be found directly after the found capture pattern
                            current = capture_span[1]
                        else:
                            # Skip the newline character in the next pattern search round
                            current = handler.next(capture_span[1])
                    else:
                        LOGGER.debug(
                            f"{self.__class__.__name__} capture: continue",
                            self,
                            current,
                            kwargs.get("depth", 0),
                        )
                        current = capture_span[1]
                else:
                    # No capture patterns nor end patterns found. Skip the current line.
                    line = handler.read_line(current)

                    if line and not line.isspace():
                        LOGGER.warning(
                            f"No patterns found in line, skipping < {repr(line)} >",
                            self,
                            current,
                            kwargs.get("depth", 0),
                        )
                    current = handler.next((current[0], handler.line_lengths[current[0]]))

            if apply_end_pattern_last:
                current = handler.next(current)

            if first_run:
                # Skip all parsers that were anchored to the begin pattern after the first round
                patterns = [parser for parser in patterns if not parser.anchored]
                first_run = False
        else:
            # Did not break out of while loop, set closing to boundary
            closing = boundary
            end_span = ((0, 0), boundary)

        # With contentName, the element covers only what lies between the begin and end matches
        start = begin_span[1] if self.between_content else begin_span[0]

        content = handler.read_pos(start, closing)
        LOGGER.info(
            f"{self.__class__.__name__} found < {repr(content)} >",
            self,
            start,
            kwargs.get("depth", 0),
        )

        # Construct output elements
        if self.token:
            elements: list[Capture | ContentElement] = [
                ContentBlockElement(
                    token=self.token,
                    grammar=self.grammar,
                    content=content,
                    characters=handler.chars(start, closing),
                    children=mid_elements,
                    begin=begin_elements,
                    end=end_elements,
                )
            ]
        else:
            elements = begin_elements + mid_elements + end_elements

        return True, elements, (begin_span[0], end_span[1])
[docs]
class BeginWhileParser(PatternsParser):
    """The parser for grammars for which a begin/while pattern is provided."""

    def __init__(self, grammar: dict, **kwargs) -> None:
        """Compile the begin and while patterns and initialize their capture parsers.

        :param grammar: The grammar dictionary, which must contain ``begin`` and ``while``.
        :param kwargs: Additional keyword arguments, forwarded to the parent constructor.
        """
        super().__init__(grammar, **kwargs)
        if "contentName" in grammar:
            self.token = grammar["contentName"]
            self.between_content = True
        else:
            self.token = grammar.get("name")
            self.between_content = False
        self.exp_begin = re.compile(grammar["begin"])
        self.exp_while = re.compile(grammar["while"])
        self.parsers_begin = self._init_captures(grammar, key="beginCaptures")
        self.parsers_while = self._init_captures(grammar, key="whileCaptures")

    def __repr__(self) -> str:
        if self.token:
            return f"{self.__class__.__name__}:{self.token}"
        # Without a token, fall back to the key or a slug made from the grammar comment
        identifier = self.key if self.key else "_".join(self.comment.lower().split(" "))
        return f"{self.__class__.__name__}:<{identifier}>"

    def _initialize_repository(self, **kwargs) -> None:
        """When the grammar has patterns, this method should be called to initialize its inclusions.

        Besides the patterns handled by the parent class, the included begin and while capture
        parsers are resolved and initialized.

        :param kwargs: Additional keyword arguments (unused); accepted for compatibility with
            ``GrammarParser._initialize_repository``.
        """
        self.initialized = True
        super()._initialize_repository()

        # This parser has begin and while captures; there is no ``parsers_end`` attribute here
        for captures in (self.parsers_begin, self.parsers_while):
            for key, value in captures.items():
                if not isinstance(value, GrammarParser):
                    captures[key] = self._find_include(value)

        for captures in (self.parsers_begin, self.parsers_while):
            for parser in captures.values():
                if not parser.initialized:
                    parser._initialize_repository()

    def _parse(
        self,
        handler: ContentHandler,
        starting: POS,
        **kwargs,
    ):
        """The parse method for grammars for which a begin/while pattern is provided.

        :raises NotImplementedError: Always; begin/while parsing is not implemented.
        """
        raise NotImplementedError