Source code for textmate_grammar.elements

from __future__ import annotations

from abc import ABC
from collections import defaultdict
from itertools import groupby
from pprint import pprint
from typing import TYPE_CHECKING, Generator

from .handler import POS, ContentHandler, Match, Pattern
from .utils.logger import LOGGER

if TYPE_CHECKING:
    from .parser import GrammarParser


# Type alias for the per-position token table built by `_token_by_index`: each character
# position maps to the list of tokens of the elements whose `characters` contain it.
TOKEN_DICT = dict[POS, list[str]]


class Capture:
    """A captured matching group.

    After matching, any pattern can have a number of capture groups for which subsequent
    parsers can be defined. The Capture object stores this subsequent parse to be dispatched
    at a later moment.
    """

    def __init__(
        self,
        handler: ContentHandler,
        pattern: Pattern,
        matching: Match,
        parsers: dict[int, GrammarParser],
        starting: tuple[int, int],
        boundary: tuple[int, int],
        key: str = "",
        **kwargs,
    ):
        """
        Initialize a new instance of the Capture class.

        :param handler: The content handler that is passed on to the group parsers.
        :param pattern: The pattern used for matching.
        :param matching: The match object whose groups are dispatched.
        :param parsers: A dictionary mapping a capture group index to its grammar parser.
        :param starting: The starting position of the capture.
        :param boundary: The boundary position of the capture.
        :param key: The key for the capture. Defaults to "".
        :param **kwargs: Additional keyword arguments, forwarded to the parsers on dispatch.
        :returns: None
        """
        self.handler = handler
        self.pattern = pattern
        self.matching = matching
        self.parsers = parsers
        self.starting = starting
        self.boundary = boundary
        self.key = key
        self.kwargs = kwargs

    def __eq__(self, other: object) -> bool:
        """Two captures are equal when key, starting position and matched text agree."""
        if not isinstance(other, Capture):
            return False
        return bool(
            self.key == other.key
            and self.starting == other.starting
            and self.matching.group() == other.matching.group()
        )

    def __repr__(self) -> str:
        return f"@capture<{self.key}>"

    def dispatch(self) -> list[Capture | ContentElement]:
        """Dispatches the remaining parse of the capture group.

        This method iterates over the defined parsers for the capture group and runs each
        parser on the span of its group. A group is skipped when its index exceeds the number
        of captures of the pattern (a warning is logged), when the group is empty, or when
        the parser-loop guard triggers.

        The forwarded options are computed once and ``self.kwargs`` is left untouched, so
        every group receives the same ``find_one`` value and repeated calls behave the same.

        :return: A list of Capture or ContentElement objects representing the parsed elements.
        """
        # ``greedy`` is never forwarded; ``find_one`` is passed explicitly (default False),
        # so it must not also appear in the unpacked keyword arguments.
        find_one = self.kwargs.get("find_one", False)
        parse_kwargs = {
            name: value
            for name, value in self.kwargs.items()
            if name not in ("greedy", "find_one")
        }

        elements: list[Capture | ContentElement] = []
        for group_id, parser in self.parsers.items():
            if group_id > self.pattern.number_of_captures():
                LOGGER.warning(
                    f"The capture group {group_id} does not exist in pattern {self.pattern._pattern}"
                )
                continue

            group_span = self.matching.span(group_id)

            # Empty group
            if group_span[0] == group_span[1]:
                continue

            # The group is located on the same first coordinate as the capture itself.
            group_starting = (self.starting[0], group_span[0])
            group_boundary = (self.starting[0], group_span[1])

            if (
                parser == self
                and group_starting == self.starting
                and group_boundary == self.boundary
            ):
                # NOTE(review): the extra positional arguments rely on the project LOGGER's
                # signature (not visible here) -- confirm.
                LOGGER.warning("Parser loop detected, continuing...", self, self.starting)
                continue

            # Dispatch the parse
            parsed, captured_elements, _ = parser._parse(
                self.handler,
                starting=group_starting,
                boundary=group_boundary,
                find_one=find_one,
                parent_capture=self,
                **parse_kwargs,
            )

            if parsed:
                elements.extend(captured_elements)

        return elements
def _dispatch_list(
    pending_elements: list[Capture | ContentElement], parent: ContentElement | None = None
) -> list[ContentElement]:
    """Resolve every pending capture in the list into concrete elements.

    Captures are dispatched (recursively) and replaced by whatever they produce; other
    items are kept unless they compare equal to ``parent``. Every resulting element gets
    ``parent`` assigned as its parent.
    """
    resolved: list[ContentElement] = []
    for pending in pending_elements:
        if not isinstance(pending, Capture):
            if pending != parent:
                resolved.append(pending)
            continue
        resolved.extend(_dispatch_list(pending.dispatch()))

    for node in resolved:
        node.parent = parent
    return resolved


def _str_to_list(input: str | list[str]) -> list[str]:
    """Wrap a non-empty string in a list, map "" to [], and pass lists through unchanged."""
    if not isinstance(input, str):
        return input
    return [input] if input else []
class ContentElement:
    """The parsed grammar element."""

    def __init__(
        self,
        token: str,
        grammar: dict,
        content: str,
        characters: dict[POS, str],
        children: list[Capture | ContentElement] | None = None,
    ) -> None:
        """
        Initialize a new instance of the ContentElement class.

        :param token: The token associated with the element.
        :param grammar: The grammar associated with the element.
        :param content: The content associated with the element.
        :param characters: The characters associated with the element, keyed by position.
        :param children: The (possibly still pending) children of the element.
            Defaults to None, meaning no children.
        """
        if children is None:
            children = []
        self.token = token
        self.grammar = grammar
        self.content = content
        self.characters = characters
        # Children are stored undispatched; they are resolved lazily on first access.
        self._children_captures = children
        self._dispatched: bool = False
        self.parent: ContentElement | None = None

    @property
    def _subelements(self) -> list[ContentElement]:
        """The elements that `find` descends into by default."""
        return self.children

    @property
    def children(self) -> list[ContentElement]:
        """
        Returns a list of children elements.

        If the elements have not been dispatched yet, this method will dispatch them
        before returning.

        :return: A list of ContentElement objects representing the children elements.
        """
        if not self._dispatched:
            self._dispatch()
        return self._children

    def _dispatch(self, nested: bool = False):
        """
        Dispatches the pending children of the content element.

        :param nested: When True, the dispatched children are dispatched recursively too.
        :type nested: bool
        :return: None
        """
        if self._dispatched:
            return
        # Flag first so that re-entrant access to `children` does not dispatch twice.
        self._dispatched = True
        self._children: list[ContentElement] = _dispatch_list(self._children_captures, parent=self)
        self._children_captures = []
        if nested:
            for child in self._children:
                child._dispatch(True)

    def __eq__(self, other):
        """Elements are equal when both grammar and characters are equal."""
        if not isinstance(other, ContentElement):
            return False
        return bool(self.grammar == other.grammar and self.characters == other.characters)

    def _find(
        self,
        tokens: str | list[str],
        start_tokens: str | list[str] = "",
        hide_tokens: str | list[str] = "",
        stop_tokens: str | list[str] = "",
        depth: int = -1,
        attribute: str = "_subelements",
        stack: list[str] | None = None,
    ) -> Generator[tuple[ContentElement, list[str]], None, None]:
        """
        Recursive worker behind `find` and `findall`.

        :param tokens: The tokens to search for; ``"*"`` matches every token.
        :param start_tokens: Nothing is yielded until one of these tokens has been seen.
        :param hide_tokens: Matching elements with one of these tokens are not yielded.
        :param stop_tokens: The search of the current element stops at the first child with
            one of these tokens; ``"*"`` stops at the first child not in ``tokens``.
        :param depth: The number of levels to search; 0 yields nothing and a negative
            value never reaches 0, i.e. the search is unlimited.
        :param attribute: The attribute of this element that holds the elements to search.
        :param stack: The tokens of the ancestors visited so far.
        :yield: A tuple of the found element and a copy of the token stack leading to it.
        :raises ValueError: If ``tokens`` and ``stop_tokens`` share a token (raised on the
            first iteration, as this is a generator).
        """
        tokens = _str_to_list(tokens)
        start_tokens = _str_to_list(start_tokens)
        hide_tokens = _str_to_list(hide_tokens)
        stop_tokens = _str_to_list(stop_tokens)

        if not set(tokens).isdisjoint(set(stop_tokens)):
            raise ValueError("Input tokens and stop_tokens must be disjoint")

        if stack is None:
            stack = []
        stack += [self.token]

        start_found = not start_tokens

        if depth:
            # One level is consumed per element; the decremented value is what is handed
            # to the children (decrementing again there would halve the reachable depth).
            depth -= 1
            children: list[ContentElement] = getattr(self, attribute, self._subelements)
            for child in children:
                if stop_tokens and (
                    child.token in stop_tokens
                    or (stop_tokens == ["*"] and child.token not in tokens)
                ):
                    return None

                if not start_found and child.token in start_tokens:
                    start_found = True
                    # Once started, nested searches must not wait for a start token again.
                    start_tokens = []

                if (
                    start_found
                    and (child.token in tokens or tokens == ["*"])
                    and child.token not in hide_tokens
                ):
                    yield child, list(stack)

                if depth:
                    # NOTE(review): `attribute` is not forwarded, so nested levels always
                    # use the default "_subelements" -- confirm this is intended.
                    yield from child._find(
                        tokens,
                        start_tokens=start_tokens,
                        hide_tokens=hide_tokens,
                        stop_tokens=stop_tokens,
                        depth=depth,
                        stack=list(stack),
                    )
        return None

    def find(
        self,
        tokens: str | list[str],
        start_tokens: str | list[str] = "",
        hide_tokens: str | list[str] = "",
        stop_tokens: str | list[str] = "",
        depth: int = -1,
        attribute: str = "_subelements",
    ) -> Generator[tuple[ContentElement, list[str]], None, None]:
        """
        Find content elements based on the given criteria.

        The find method will return a generator that globs through the element-tree,
        searching for the next subelement that matches the given token.

        :param tokens: The tokens to search for. Can be a single token or a list of tokens.
        :param start_tokens: The tokens that mark the start of the search. Can be a single
            token or a list of tokens.
        :param hide_tokens: The tokens to hide from the search results. Can be a single
            token or a list of tokens.
        :param stop_tokens: The tokens that mark the end of the search. Can be a single
            token or a list of tokens.
        :param depth: The maximum depth to search. Defaults to -1 (unlimited depth).
        :param attribute: The attribute name to access the subelements. Defaults to
            "_subelements".
        :yield: A tuple containing the found content element and the stack of tokens
            encountered.
        :raises ValueError: If the input tokens and stop_tokens are not disjoint.
        """
        return self._find(
            tokens,
            start_tokens=start_tokens,
            hide_tokens=hide_tokens,
            stop_tokens=stop_tokens,
            depth=depth,
            attribute=attribute,
        )

    def findall(
        self,
        tokens: str | list[str],
        start_tokens: str | list[str] = "",
        hide_tokens: str | list[str] = "",
        stop_tokens: str | list[str] = "",
        depth: int = -1,
        attribute: str = "_subelements",
    ) -> list[tuple[ContentElement, list[str]]]:
        """
        Find all occurrences of the specified tokens within the content element.

        :param tokens: The tokens to search for.
        :param start_tokens: The tokens that must appear before the found tokens.
            Defaults to "".
        :param hide_tokens: The tokens that should be hidden from the search. Defaults to "".
        :param stop_tokens: The tokens that, if found, should stop the search. Defaults to "".
        :param depth: The maximum depth to search. Defaults to -1 (unlimited depth).
        :param attribute: The attribute to search within. Defaults to "_subelements".
        :return: A list of tuples containing the content element and the stack of tokens
            leading to it.
        """
        return list(
            self._find(
                tokens,
                start_tokens=start_tokens,
                hide_tokens=hide_tokens,
                stop_tokens=stop_tokens,
                depth=depth,
                attribute=attribute,
            )
        )

    def to_dict(self, depth: int = -1, all_content: bool = False, **kwargs) -> dict:
        """
        Converts the object to a dictionary.

        :param depth: The number of levels to convert; at depth 0 the children are included
            as elements instead of dictionaries. Defaults to -1.
        :param all_content: Whether to include the content of every element, or only of
            elements without children. Defaults to False.
        :param **kwargs: Accepted for compatibility with subclasses; not used here.
        :return: The converted dictionary representation of the object.
        """
        out_dict: dict = {"token": self.token}
        if all_content or not self.children:
            out_dict["content"] = self.content
        if self.children:
            out_dict["children"] = (
                self._list_property_to_dict("children", depth=depth - 1, all_content=all_content)
                if depth
                else self.children
            )
        return out_dict

    def flatten(self) -> list[tuple[tuple[int, int], str, list[str]]]:
        """
        Converts the object to a flattened array of tokens per index, similarly to
        vscode-textmate.

        Consecutive positions (in sorted order) that share the first position component and
        the same token list are merged into one entry.

        :return: A list of tuples representing the flattened tokens. Each tuple contains:
            - The position of the first character of the entry.
            - The content of the entry.
            - The list of tokens associated with the entry.
        """
        token_dict = self._token_by_index(defaultdict(list))
        tokens = []
        for (_, key), group in groupby(sorted(token_dict.items()), lambda x: (x[0][0], x[1])):
            positions = [pos for pos, _ in group]
            content = "".join(self.characters[pos] for pos in positions)
            if content:
                tokens.append((positions[0], content, key))
        return tokens

    def print(
        self,
        flatten: bool = False,
        depth: int = -1,
        all_content: bool = False,
        **kwargs,
    ) -> None:
        """
        Prints the current object recursively by converting it to a dictionary or a
        flattened array.

        :param flatten: If True, flattens the object before printing. Defaults to False.
        :param depth: The maximum depth to print. Defaults to -1 (unlimited depth).
        :param all_content: If True, includes all content in the printout. Defaults to False.
        :param **kwargs: Additional keyword arguments to be passed to the pprint function;
            ``width`` defaults to 150.
        :return: None
        """
        if flatten:
            # `flatten` takes no options; the keyword arguments are meant for pprint only.
            payload = self.flatten()
        else:
            payload = self.to_dict(depth=depth, all_content=all_content, **kwargs)
        width = kwargs.pop("width", 150)
        pprint(payload, sort_dicts=False, width=width, **kwargs)

    def _token_by_index(self, token_dict: TOKEN_DICT | None = None) -> TOKEN_DICT:
        """Recursively tokenize every character position of the element.

        The token of this element is appended to the entry of each of its character
        positions, after which the children add their own tokens to the same dictionary.

        :param token_dict: A dictionary to store the tokens. If None, a new dictionary is
            created.
        :type token_dict: dict | None
        :return: A dictionary containing the tokens for each position.
        :rtype: dict
        """
        if token_dict is None:
            token_dict = defaultdict(list)
        for pos in self.characters:
            token_dict[pos].append(self.token)

        # Tokenize child elements
        for element in self.children:
            element._token_by_index(token_dict)
        return token_dict

    def _list_property_to_dict(self, prop: str, **kwargs):
        """Convert the elements of the list attribute ``prop`` with `to_dict`.

        Items that are not content elements are passed through unchanged; a missing
        attribute yields an empty list.
        """
        return [
            item.to_dict(**kwargs) if isinstance(item, ContentElement) else item
            for item in getattr(self, prop, [])
        ]

    def __repr__(self) -> str:
        # Long content is cut off at 15 characters to keep the representation short.
        content = self.content if len(self.content) < 15 else self.content[:15] + "..."
        return repr(f"{self.token}<<{content}>>({len(self.children)})")
class ContentBlockElement(ContentElement):
    """A parsed element with a begin and an end."""

    def __init__(
        self,
        *args,
        begin: list[Capture | ContentElement] | None = None,
        end: list[Capture | ContentElement] | None = None,
        **kwargs,
    ) -> None:
        """
        Initialize a new instance of the ContentBlockElement class.

        :param *args: Positional arguments to be passed to the parent class constructor.
        :param begin: A list of Capture or ContentElement objects representing the beginning
            captures of the element. Defaults to None, meaning no begin captures.
        :param end: A list of Capture or ContentElement objects representing the ending
            captures of the element. Defaults to None, meaning no end captures.
        :param **kwargs: Additional keyword arguments to be passed to the parent class
            constructor.
        :return: None
        """
        if end is None:
            end = []
        if begin is None:
            begin = []
        super().__init__(*args, **kwargs)
        # Like the children, begin and end are stored undispatched and resolved lazily.
        self._begin_captures = begin
        self._end_captures = end

    @property
    def _subelements(self) -> list[ContentElement]:
        """The begin elements, the children and the end elements, in that order."""
        return self.begin + self.children + self.end

    @property
    def begin(self) -> list[ContentElement]:
        """
        Returns the list of begin elements.

        If the elements have not been dispatched yet, this method will dispatch them
        before returning.

        :return: The list of begin elements.
        """
        if not self._dispatched:
            self._dispatch()
        return self._begin

    @property
    def end(self) -> list[ContentElement]:
        """
        Returns the end elements.

        If the elements have not been dispatched yet, this method will dispatch them
        before returning.

        :return: A list of end elements.
        """
        if not self._dispatched:
            self._dispatch()
        return self._end

    def _dispatch(self, nested: bool = False):
        """
        Dispatches the pending children, begin and end captures of the element.

        :param nested: When True, the dispatched elements are dispatched recursively too.
        :return: None
        """
        if self._dispatched:
            return
        # The parent implementation dispatches the children and sets the dispatched flag.
        super()._dispatch(nested)
        self._begin: list[ContentElement] = _dispatch_list(self._begin_captures, parent=self)
        self._end: list[ContentElement] = _dispatch_list(self._end_captures, parent=self)
        self._begin_captures, self._end_captures = [], []
        if nested:
            for item in self._begin:
                item._dispatch(True)
            for item in self._end:
                item._dispatch(True)

    def to_dict(self, depth: int = -1, all_content: bool = False, **kwargs) -> dict:
        """
        Converts the element to a dictionary representation.

        The keys are ordered as token, begin, end, content, children; keys without a value
        are left out.

        :param depth: The number of levels to convert; at depth 0 the begin and end
            elements are included as elements instead of dictionaries. Defaults to -1.
        :param all_content: Whether to include all content. It is applied to the begin and
            end elements as well as to the children. Defaults to False.
        :param **kwargs: Additional keyword arguments, forwarded to the nested conversions.
        :return: The dictionary representation of the element.
        """
        out_dict = super().to_dict(depth=depth, all_content=all_content, **kwargs)
        for prop in ("begin", "end"):
            items = getattr(self, prop)
            if not items:
                continue
            out_dict[prop] = (
                self._list_property_to_dict(
                    prop, depth=depth - 1, all_content=all_content, **kwargs
                )
                if depth
                else items
            )

        key_order = ("token", "begin", "end", "content", "children")
        return {key: out_dict[key] for key in key_order if key in out_dict}

    def _token_by_index(self, token_dict: TOKEN_DICT | None = None) -> TOKEN_DICT:
        """Tokenize every character position of the element, including begin and end.

        :param token_dict: A dictionary to store the tokens. If None, a new dictionary is
            created.
        :return: A dictionary containing the tokens for each position.
        """
        if token_dict is None:
            token_dict = defaultdict(list)
        super()._token_by_index(token_dict)
        for element in self.begin:
            element._token_by_index(token_dict)
        for element in self.end:
            element._token_by_index(token_dict)
        return token_dict