# Source code for yapcad.dsl.lexer

"""
Lexer for the yapCAD DSL v2 (Pythonic Syntax).

Converts source text into a stream of tokens for the parser.
Supports:
- Python-style indentation (INDENT/DEDENT tokens)
- Significant newlines (NEWLINE tokens)
- Implicit line continuation inside brackets
- Single-line comments (#)
- Multi-line comments (/* */)
- String literals with escape sequences
- Multi-line strings (triple quotes)
- Integer literals (decimal, hex, binary)
- Float literals (including scientific notation)
- All DSL keywords and operators
"""

from typing import List, Optional, Iterator
from .tokens import (
    Token, TokenType, SourceLocation, SourceSpan, KEYWORDS,
    is_deprecated_keyword, get_deprecation_message
)
from .errors import (
    LexerError,
    error_unexpected_character,
    error_unterminated_string,
    error_unterminated_multiline_string,
    error_unterminated_comment,
    error_invalid_escape_sequence,
    error_invalid_number_literal,
    error_invalid_hex_literal,
    error_invalid_binary_literal,
)



[docs]
class Lexer:
    """
    Tokenizer for the yapCAD DSL v2 with Python-style indentation.

    This lexer generates INDENT and DEDENT tokens based on changes in
    leading whitespace, similar to Python's tokenizer.

    Usage:
        lexer = Lexer(source_code)
        tokens = lexer.tokenize()

    Or for streaming:
        lexer = Lexer(source_code)
        for token in lexer:
            process(token)
    """

    def __init__(self, source: str, filename: Optional[str] = None):
        self.source = source
        self.filename = filename
        self.pos = 0            # Current position in source
        self.line = 1           # Current line (1-indexed)
        self.column = 1         # Current column (1-indexed)
        self.line_start = 0     # Position of current line start
        self._lines: Optional[List[str]] = None  # Cached line list

        # Indentation tracking
        self.indent_stack = [0]  # Stack of indentation levels (starts at 0)
        self.at_line_start = True  # Are we at the start of a line?
        self.pending_tokens: List[Token] = []  # Tokens to emit before next scan

        # Bracket nesting for implicit line continuation
        self.bracket_depth = 0  # Count of (, [, { nesting

    @property
    def lines(self) -> List[str]:
        """Lazy-load line list for error reporting."""
        if self._lines is None:
            self._lines = self.source.splitlines()
        return self._lines


[docs]
    def get_source_line(self, line_num: int) -> Optional[str]:
        """Get a specific line of source (1-indexed)."""
        if 1 <= line_num <= len(self.lines):
            return self.lines[line_num - 1]
        return None


    def _location(self) -> SourceLocation:
        """Snapshot the cursor (line, column, offset, filename) as a SourceLocation."""
        line, column, offset = self.line, self.column, self.pos
        return SourceLocation(line, column, offset, self.filename)

    def _span(self, start: SourceLocation) -> SourceSpan:
        """Build the span that begins at ``start`` and ends at the cursor."""
        end = self._location()
        return SourceSpan(start, end)

    def _peek(self, offset: int = 0) -> str:
        """Look at character at current position + offset without consuming."""
        idx = self.pos + offset
        if idx >= len(self.source):
            return '\0'
        return self.source[idx]

    def _advance(self) -> str:
        """Consume and return current character."""
        if self.pos >= len(self.source):
            return '\0'
        ch = self.source[self.pos]
        self.pos += 1
        if ch == '\n':
            self.line += 1
            self.column = 1
            self.line_start = self.pos
            self.at_line_start = True
        else:
            self.column += 1
        return ch

    def _match(self, expected: str) -> bool:
        """Consume character if it matches expected."""
        if self._peek() == expected:
            self._advance()
            return True
        return False

    def _is_at_end(self) -> bool:
        """Check if we've reached end of source."""
        return self.pos >= len(self.source)

    def _skip_comment(self) -> None:
        """Skip a single-line comment (# to end of line)."""
        while self._peek() != '\n' and not self._is_at_end():
            self._advance()

    def _skip_multiline_comment(self) -> None:
        """Skip /* ... */ comment."""
        start = self._location()
        self._advance()  # consume '/'
        self._advance()  # consume '*'
        depth = 1  # Support nested comments

        while not self._is_at_end() and depth > 0:
            if self._peek() == '/' and self._peek(1) == '*':
                self._advance()
                self._advance()
                depth += 1
            elif self._peek() == '*' and self._peek(1) == '/':
                self._advance()
                self._advance()
                depth -= 1
            else:
                self._advance()

        if depth > 0:
            raise error_unterminated_comment(
                self._span(start),
                self.get_source_line(start.line)
            )

    def _handle_line_start(self) -> Optional[Token]:
        """
        Handle indentation at the start of a line.
        Returns a NEWLINE token if we have a significant newline,
        and queues INDENT/DEDENT tokens as needed.
        """
        if not self.at_line_start:
            return None

        # Calculate indentation (spaces and tabs)
        # Note: We treat tabs as single indent units, but warn about mixing
        start = self._location()
        indent = 0
        while self._peek() in ' \t':
            if self._peek() == ' ':
                indent += 1
            else:  # tab
                indent += 1  # Simple approach: tab = 1 indent unit
            self._advance()

        # Skip blank lines and comment-only lines
        if self._peek() == '\n' or self._peek() == '\0':
            self.at_line_start = True
            return None
        if self._peek() == '#':
            self._skip_comment()
            return None
        if self._peek() == '/' and self._peek(1) == '*':
            self._skip_multiline_comment()
            # After multiline comment, check if we're at line start again
            return None

        self.at_line_start = False

        # Inside brackets, indentation doesn't matter
        if self.bracket_depth > 0:
            return None

        current_indent = self.indent_stack[-1]

        if indent > current_indent:
            # Indent
            self.indent_stack.append(indent)
            return self._make_token(TokenType.INDENT, None, start, "")
        elif indent < current_indent:
            # Dedent - may need multiple DEDENT tokens
            dedent_tokens = []
            while self.indent_stack[-1] > indent:
                self.indent_stack.pop()
                dedent_tokens.append(
                    self._make_token(TokenType.DEDENT, None, start, "")
                )
            # Check for inconsistent dedent
            if self.indent_stack[-1] != indent:
                # Dedent to a level that was never indented to
                # This is an error in Python, but we'll be lenient
                self.indent_stack.append(indent)

            if dedent_tokens:
                # Queue all but the first
                self.pending_tokens.extend(dedent_tokens[1:])
                return dedent_tokens[0]

        return None

    def _skip_whitespace_within_line(self) -> None:
        """Skip horizontal whitespace (spaces and tabs), not newlines."""
        while self._peek() in ' \t':
            self._advance()

    def _make_token(self, token_type: TokenType, value, start: SourceLocation,
                    lexeme: Optional[str] = None) -> Token:
        """
        Build a token whose span runs from ``start`` to the cursor.

        When ``lexeme`` is not supplied it is taken verbatim from the
        source between ``start.offset`` and the current position.
        """
        covered = self._span(start)
        text = lexeme if lexeme is not None else self.source[start.offset:self.pos]
        return Token(token_type, value, text, covered)

    def _scan_string(self) -> Token:
        """
        Scan a quoted string literal starting at the opening quote.

        Three identical quotes in a row hand over to the triple-quoted
        scanner. Otherwise characters are collected (with escape sequences
        decoded) up to the matching quote.

        Raises:
            LexerError: (via ``error_unterminated_string``) when a newline
                or the end of input is reached before the closing quote.
        """
        start = self._location()
        quote = self._advance()  # opening quote

        # Two more quotes right away means a triple-quoted string.
        if self._peek() == quote and self._peek(1) == quote:
            self._advance()
            self._advance()
            return self._scan_multiline_string(start, quote)

        pieces = []
        while True:
            if self._is_at_end():
                raise error_unterminated_string(
                    self._span(start),
                    self.get_source_line(start.line)
                )
            upcoming = self._peek()
            if upcoming == quote:
                break
            if upcoming == '\n':
                raise error_unterminated_string(
                    self._span(start),
                    self.get_source_line(start.line)
                )
            if upcoming == '\\':
                self._advance()  # the backslash itself
                pieces.append(self._scan_escape_sequence(start))
                continue
            pieces.append(self._advance())

        self._advance()  # closing quote
        return self._make_token(TokenType.STRING_LITERAL, ''.join(pieces), start)

    def _scan_multiline_string(self, start: SourceLocation, quote: str) -> Token:
        """
        Scan the body of a triple-quoted string.

        The three opening quotes have already been consumed by the caller.
        Characters (including newlines) are collected, with escape
        sequences decoded, until three consecutive ``quote`` characters are
        found.

        ``at_line_start`` is restored to its value on entry before the token
        is returned. Newlines inside the literal make ``_advance`` set the
        flag, but after the closing quotes the lexer is in the middle of a
        line; leaving the flag set made the next scan treat the whitespace
        after the string as indentation and emit a spurious INDENT.

        Args:
            start: Location of the first opening quote (used for the span).
            quote: The quote character that delimits the string.

        Raises:
            LexerError: (via ``error_unterminated_multiline_string``) if the
                input ends before the closing quotes.
        """
        was_at_line_start = self.at_line_start
        chars = []
        while not self._is_at_end():
            if self._peek() == quote and self._peek(1) == quote and self._peek(2) == quote:
                self._advance()  # consume the three closing quotes
                self._advance()
                self._advance()
                # Newlines inside the literal are not line breaks for the
                # purpose of indentation tracking.
                self.at_line_start = was_at_line_start
                value = ''.join(chars)
                return self._make_token(TokenType.STRING_LITERAL, value, start)

            ch = self._peek()
            if ch == '\\':
                self._advance()
                chars.append(self._scan_escape_sequence(start))
            else:
                chars.append(self._advance())

        raise error_unterminated_multiline_string(
            self._span(start),
            self.get_source_line(start.line)
        )

    def _scan_escape_sequence(self, string_start: SourceLocation) -> str:
        """Parse an escape sequence after backslash."""
        esc_start = self._location()
        if self._is_at_end():
            raise error_invalid_escape_sequence(
                "", self._span(esc_start), self.get_source_line(esc_start.line)
            )

        ch = self._advance()
        escape_chars = {
            'n': '\n',
            't': '\t',
            'r': '\r',
            '\\': '\\',
            '"': '"',
            "'": "'",
            '0': '\0',
        }

        if ch in escape_chars:
            return escape_chars[ch]
        elif ch == 'x':
            # Hex escape: \xHH
            hex_chars = self._advance() + self._advance()
            try:
                return chr(int(hex_chars, 16))
            except ValueError:
                raise error_invalid_escape_sequence(
                    f"x{hex_chars}", self._span(esc_start),
                    self.get_source_line(esc_start.line)
                )
        elif ch == 'u':
            # Unicode escape: \uHHHH
            hex_chars = ''
            for _ in range(4):
                hex_chars += self._advance()
            try:
                return chr(int(hex_chars, 16))
            except ValueError:
                raise error_invalid_escape_sequence(
                    f"u{hex_chars}", self._span(esc_start),
                    self.get_source_line(esc_start.line)
                )
        else:
            raise error_invalid_escape_sequence(
                ch, self._span(esc_start), self.get_source_line(esc_start.line)
            )

    def _scan_number(self) -> Token:
        """
        Scan a numeric literal beginning at the current digit.

        A leading ``0x``/``0X`` or ``0b``/``0B`` is delegated to the hex and
        binary scanners. Otherwise decimal digits are read, optionally
        followed by a fractional part (``.`` plus at least one digit) and/or
        an exponent (``e``/``E``, optional sign, digits). The presence of a
        fraction or exponent makes the literal a float.

        Raises:
            LexerError: (via ``error_invalid_number_literal``) when an
                exponent has no digits or the text cannot be converted.
        """
        start = self._location()

        # A leading zero may introduce a hex or binary literal.
        if self._peek() == '0':
            self._advance()
            marker = self._peek()
            if marker in 'xX':
                return self._scan_hex_number(start)
            if marker in 'bB':
                return self._scan_binary_number(start)
            # Plain decimal that merely starts with 0: fall through.

        def consume_digits() -> None:
            while self._peek().isdigit():
                self._advance()

        consume_digits()

        # Fraction: only when the dot is directly followed by a digit, so
        # that "1..5" and "1.method" are not swallowed.
        has_fraction = self._peek() == '.' and self._peek(1).isdigit()
        if has_fraction:
            self._advance()  # the '.'
            consume_digits()

        # Exponent part.
        has_exponent = self._peek() in 'eE'
        if has_exponent:
            self._advance()  # the 'e' / 'E'
            if self._peek() in '+-':
                self._advance()
            if not self._peek().isdigit():
                raise error_invalid_number_literal(
                    self.source[start.offset:self.pos],
                    self._span(start),
                    self.get_source_line(start.line)
                )
            consume_digits()

        lexeme = self.source[start.offset:self.pos]
        try:
            if has_fraction or has_exponent:
                return self._make_token(
                    TokenType.FLOAT_LITERAL, float(lexeme), start, lexeme
                )
            return self._make_token(
                TokenType.INT_LITERAL, int(lexeme), start, lexeme
            )
        except ValueError:
            raise error_invalid_number_literal(
                lexeme, self._span(start), self.get_source_line(start.line)
            )

    def _scan_hex_number(self, start: SourceLocation) -> Token:
        """
        Scan the rest of a hexadecimal literal.

        The leading ``0`` is already consumed and the cursor is on the
        ``x``/``X``. At least one hex digit must follow; after that, hex
        digits and ``_`` separators are accepted. Underscores are removed
        before conversion but kept in the token's lexeme.

        Raises:
            LexerError: (via ``error_invalid_hex_literal``) if no hex digit
                follows the prefix or the text cannot be converted.
        """
        hex_digits = '0123456789abcdefABCDEF'
        self._advance()  # the 'x' / 'X'

        if self._peek() not in hex_digits:
            raise error_invalid_hex_literal(
                self.source[start.offset:self.pos],
                self._span(start),
                self.get_source_line(start.line)
            )

        allowed = hex_digits + '_'
        while self._peek() in allowed:
            self._advance()

        lexeme = self.source[start.offset:self.pos]
        try:
            number = int(lexeme.replace('_', ''), 16)
            return self._make_token(TokenType.INT_LITERAL, number, start, lexeme)
        except ValueError:
            raise error_invalid_hex_literal(
                lexeme, self._span(start), self.get_source_line(start.line)
            )

    def _scan_binary_number(self, start: SourceLocation) -> Token:
        """
        Scan the rest of a binary literal.

        The leading ``0`` is already consumed and the cursor is on the
        ``b``/``B``. At least one binary digit must follow; after that,
        ``0``, ``1`` and ``_`` separators are accepted. Underscores are
        removed before conversion but kept in the token's lexeme.

        Raises:
            LexerError: (via ``error_invalid_binary_literal``) if no binary
                digit follows the prefix or the text cannot be converted.
        """
        self._advance()  # the 'b' / 'B'

        first_digit = self._peek()
        if first_digit not in '01':
            raise error_invalid_binary_literal(
                self.source[start.offset:self.pos],
                self._span(start),
                self.get_source_line(start.line)
            )

        while True:
            if self._peek() not in '01_':
                break
            self._advance()

        lexeme = self.source[start.offset:self.pos]
        try:
            number = int(lexeme.replace('_', ''), 2)
            return self._make_token(TokenType.INT_LITERAL, number, start, lexeme)
        except ValueError:
            raise error_invalid_binary_literal(
                lexeme, self._span(start), self.get_source_line(start.line)
            )

    def _scan_identifier_or_keyword(self) -> Token:
        """
        Scan a word made of alphanumerics and underscores.

        If the word is a key of ``KEYWORDS`` the mapped token type is used;
        the boolean keywords (``True``/``true``/``False``/``false``) carry a
        Python bool as their value, other keywords carry their own text.
        Any other word becomes an IDENTIFIER token.
        """
        start = self._location()

        while True:
            upcoming = self._peek()
            if not (upcoming.isalnum() or upcoming == '_'):
                break
            self._advance()

        word = self.source[start.offset:self.pos]

        # Not a keyword: plain identifier.
        if word not in KEYWORDS:
            return self._make_token(TokenType.IDENTIFIER, word, start, word)

        keyword_type = KEYWORDS[word]
        if word in ('True', 'true'):
            keyword_value = True
        elif word in ('False', 'false'):
            keyword_value = False
        else:
            keyword_value = word
        return self._make_token(keyword_type, keyword_value, start, word)

    def _scan_token(self) -> Token:
        """
        Scan and return the next token.

        Order of work:

        1. hand out queued tokens (extra DEDENTs) first;
        2. process indentation when at the start of a line;
        3. skip horizontal whitespace and comments;
        4. turn a newline into a NEWLINE token, or ignore it when inside
           brackets (implicit line continuation);
        5. at end of input, close all open indentation levels with DEDENT
           tokens and then emit EOF;
        6. otherwise scan a string, number, identifier/keyword, operator or
           punctuation token.

        Newlines inside brackets are skipped with a loop rather than by
        calling this method recursively, so a long run of blank or
        comment-only lines inside a bracketed expression cannot exhaust the
        interpreter's recursion limit.

        Raises:
            LexerError: (via ``error_unexpected_character``) for a character
                that starts no known token, plus any error raised by the
                specialised scanners.
        """
        while True:
            # Return any pending tokens first (from DEDENT sequences)
            if self.pending_tokens:
                return self.pending_tokens.pop(0)

            # Handle line start indentation
            indent_token = self._handle_line_start()
            if indent_token:
                return indent_token

            # Skip horizontal whitespace
            self._skip_whitespace_within_line()

            # Comments that do not start a line (those are handled above)
            while self._peek() == '#' or (self._peek() == '/' and self._peek(1) == '*'):
                if self._peek() == '#':
                    self._skip_comment()
                else:
                    self._skip_multiline_comment()
                self._skip_whitespace_within_line()

            if self._peek() != '\n':
                break

            start = self._location()
            self._advance()
            # Only emit NEWLINE if not inside brackets
            if self.bracket_depth == 0:
                return self._make_token(TokenType.NEWLINE, None, start, "\\n")
            # Inside brackets the newline is insignificant: go round again
            # and look at the next line.

        if self._is_at_end():
            # At EOF, emit any remaining DEDENT tokens
            start = self._location()
            while len(self.indent_stack) > 1:
                self.indent_stack.pop()
                self.pending_tokens.append(
                    self._make_token(TokenType.DEDENT, None, start, "")
                )
            if self.pending_tokens:
                return self.pending_tokens.pop(0)
            return self._make_token(TokenType.EOF, None, self._location(), "")

        start = self._location()
        ch = self._peek()

        # String literals
        if ch in '"\'':
            return self._scan_string()

        # Numbers
        if ch.isdigit():
            return self._scan_number()

        # Identifiers and keywords
        if ch.isalpha() or ch == '_':
            return self._scan_identifier_or_keyword()

        # Everything below is an operator or punctuation: consume the first
        # character, then try to extend it to a two-character operator.
        self._advance()

        # Two-character operators
        if ch == '-' and self._match('>'):
            return self._make_token(TokenType.ARROW, "->", start)
        if ch == '=' and self._match('>'):
            return self._make_token(TokenType.DOUBLE_ARROW, "=>", start)
        if ch == '=' and self._match('='):
            return self._make_token(TokenType.EQ, "==", start)
        if ch == '!' and self._match('='):
            return self._make_token(TokenType.NE, "!=", start)
        if ch == '<' and self._match('='):
            return self._make_token(TokenType.LE, "<=", start)
        if ch == '>' and self._match('='):
            return self._make_token(TokenType.GE, ">=", start)
        if ch == '.' and self._match('.'):
            return self._make_token(TokenType.RANGE, "..", start)
        if ch == '*' and self._match('*'):
            return self._make_token(TokenType.DOUBLE_STAR, "**", start)
        if ch == '/' and self._match('/'):
            return self._make_token(TokenType.DOUBLE_SLASH, "//", start)
        if ch == '+' and self._match('='):
            return self._make_token(TokenType.PLUS_ASSIGN, "+=", start)
        if ch == '-' and self._match('='):
            return self._make_token(TokenType.MINUS_ASSIGN, "-=", start)

        # Track bracket depth for implicit line continuation. Closing
        # brackets never take the depth below zero, so an unbalanced closer
        # does not corrupt newline handling.
        if ch == '(':
            self.bracket_depth += 1
            return self._make_token(TokenType.LPAREN, ch, start)
        if ch == ')':
            self.bracket_depth = max(0, self.bracket_depth - 1)
            return self._make_token(TokenType.RPAREN, ch, start)
        if ch == '[':
            self.bracket_depth += 1
            return self._make_token(TokenType.LBRACKET, ch, start)
        if ch == ']':
            self.bracket_depth = max(0, self.bracket_depth - 1)
            return self._make_token(TokenType.RBRACKET, ch, start)
        if ch == '{':
            self.bracket_depth += 1
            return self._make_token(TokenType.LBRACE, ch, start)
        if ch == '}':
            self.bracket_depth = max(0, self.bracket_depth - 1)
            return self._make_token(TokenType.RBRACE, ch, start)

        # Single-character tokens
        single_char_tokens = {
            '+': TokenType.PLUS,
            '-': TokenType.MINUS,
            '*': TokenType.STAR,
            '/': TokenType.SLASH,
            '%': TokenType.PERCENT,
            '<': TokenType.LT,
            '>': TokenType.GT,
            '=': TokenType.ASSIGN,
            ':': TokenType.COLON,
            ';': TokenType.SEMICOLON,  # Deprecated but kept for error messages
            ',': TokenType.COMMA,
            '.': TokenType.DOT,
            '?': TokenType.QUESTION,
            '@': TokenType.AT,
            '#': TokenType.HASH,  # Usually not reached (comments)
        }

        if ch in single_char_tokens:
            return self._make_token(single_char_tokens[ch], ch, start)

        # Unknown character
        raise error_unexpected_character(
            ch, self._span(start), self.get_source_line(start.line)
        )


[docs]
    def tokenize(self) -> List[Token]:
        """
        Run the lexer to completion.

        Returns:
            Every token in order; the final element is the EOF token.
        """
        collected = []
        reached_eof = False
        while not reached_eof:
            current = self._scan_token()
            collected.append(current)
            reached_eof = current.type == TokenType.EOF
        return collected


    def __iter__(self) -> Iterator[Token]:
        """Yield tokens one at a time; the EOF token is yielded last."""
        current = self._scan_token()
        yield current
        while current.type != TokenType.EOF:
            current = self._scan_token()
            yield current




[docs]
def tokenize(source: str, filename: Optional[str] = None) -> List[Token]:
    """
    Tokenize ``source`` in one call.

    This is shorthand for building a :class:`Lexer` and calling its
    ``tokenize`` method.

    Args:
        source: The source code to tokenize
        filename: Optional filename for error messages

    Returns:
        List of tokens

    Raises:
        LexerError: If tokenization fails
    """
    return Lexer(source, filename).tokenize()