"""
Lexer for the yapCAD DSL v2 (Pythonic Syntax).
Converts source text into a stream of tokens for the parser.
Supports:
- Python-style indentation (INDENT/DEDENT tokens)
- Significant newlines (NEWLINE tokens)
- Implicit line continuation inside brackets
- Single-line comments (#)
- Multi-line comments (/* */)
- String literals with escape sequences
- Multi-line strings (triple quotes)
- Integer literals (decimal, hex, binary)
- Float literals (including scientific notation)
- All DSL keywords and operators
"""
from typing import List, Optional, Iterator
from .tokens import (
Token, TokenType, SourceLocation, SourceSpan, KEYWORDS,
is_deprecated_keyword, get_deprecation_message
)
from .errors import (
LexerError,
error_unexpected_character,
error_unterminated_string,
error_unterminated_multiline_string,
error_unterminated_comment,
error_invalid_escape_sequence,
error_invalid_number_literal,
error_invalid_hex_literal,
error_invalid_binary_literal,
)
# [docs]  -- leftover link text from the rendered documentation page; not code
class Lexer:
    """
    Tokenizer for the yapCAD DSL v2 with Python-style indentation.

    This lexer generates INDENT and DEDENT tokens based on changes in
    leading whitespace, similar to Python's tokenizer.

    Usage:
        lexer = Lexer(source_code)
        tokens = lexer.tokenize()

    Or for streaming:
        lexer = Lexer(source_code)
        for token in lexer:
            process(token)
    """

    # Digits accepted in hexadecimal literals and in \x / \u string escapes.
    _HEX_DIGITS = '0123456789abcdefABCDEF'

    def __init__(self, source: str, filename: Optional[str] = None):
        """
        Prepare a lexer positioned at the start of ``source``.

        Args:
            source: Complete source text to tokenize.
            filename: Optional name stored in every source location.
        """
        self.source = source
        self.filename = filename
        self.pos = 0  # Current position in source
        self.line = 1  # Current line (1-indexed)
        self.column = 1  # Current column (1-indexed)
        self.line_start = 0  # Position of current line start
        self._lines: Optional[List[str]] = None  # Cached line list

        # Indentation tracking
        self.indent_stack = [0]  # Stack of indentation levels (starts at 0)
        self.at_line_start = True  # Are we at the start of a line?
        self.pending_tokens: List["Token"] = []  # Tokens to emit before next scan

        # Bracket nesting for implicit line continuation
        self.bracket_depth = 0  # Count of (, [, { nesting

    @property
    def lines(self) -> List[str]:
        """
        Lazily built list of source lines used for error reporting.

        Lines are split on the newline character only, because that is the
        only character ``_advance`` counts as a line break; this keeps
        ``get_source_line(n)`` aligned with the line numbers stored in source
        locations. A carriage return directly before a newline is dropped
        from the line text, and a final newline does not produce an extra
        empty line.
        """
        if self._lines is None:
            parts = self.source.split('\n')
            # Drop the empty tail produced by a terminating newline (and the
            # single empty entry produced by an empty source).
            if parts and parts[-1] == '':
                parts.pop()
            self._lines = [
                part[:-1] if part.endswith('\r') else part for part in parts
            ]
        return self._lines

    def get_source_line(self, line_num: int) -> Optional[str]:
        """Get a specific line of source (1-indexed), or None if out of range."""
        if 1 <= line_num <= len(self.lines):
            return self.lines[line_num - 1]
        return None

    def _location(self) -> "SourceLocation":
        """Get current source location (line, column, offset, filename)."""
        return SourceLocation(self.line, self.column, self.pos, self.filename)

    def _span(self, start: "SourceLocation") -> "SourceSpan":
        """Create a span from start to current position."""
        return SourceSpan(start, self._location())

    def _peek(self, offset: int = 0) -> str:
        """
        Look at character at current position + offset without consuming.

        Returns the NUL character when the index is past the end of source.
        """
        idx = self.pos + offset
        if idx >= len(self.source):
            return '\0'
        return self.source[idx]

    def _advance(self) -> str:
        """
        Consume and return current character.

        Updates line/column bookkeeping; consuming a newline also marks the
        lexer as being at the start of a line. At end of source nothing is
        consumed and the NUL character is returned.
        """
        if self.pos >= len(self.source):
            return '\0'
        ch = self.source[self.pos]
        self.pos += 1
        if ch == '\n':
            self.line += 1
            self.column = 1
            self.line_start = self.pos
            self.at_line_start = True
        else:
            self.column += 1
        return ch

    def _match(self, expected: str) -> bool:
        """Consume character if it matches expected; report whether it did."""
        if self._peek() == expected:
            self._advance()
            return True
        return False

    def _is_at_end(self) -> bool:
        """Check if we've reached end of source."""
        return self.pos >= len(self.source)

    def _skip_comment(self) -> None:
        """Skip a single-line comment (# to end of line, newline not consumed)."""
        while self._peek() != '\n' and not self._is_at_end():
            self._advance()

    def _skip_multiline_comment(self) -> None:
        """
        Skip a /* ... */ comment, honouring nested comments.

        Raises:
            The error built by ``error_unterminated_comment`` when the source
            ends before every opened comment is closed.
        """
        start = self._location()
        self._advance()  # consume '/'
        self._advance()  # consume '*'
        depth = 1  # Support nested comments
        while not self._is_at_end() and depth > 0:
            if self._peek() == '/' and self._peek(1) == '*':
                self._advance()
                self._advance()
                depth += 1
            elif self._peek() == '*' and self._peek(1) == '/':
                self._advance()
                self._advance()
                depth -= 1
            else:
                self._advance()
        if depth > 0:
            raise error_unterminated_comment(
                self._span(start),
                self.get_source_line(start.line)
            )

    def _handle_line_start(self) -> Optional["Token"]:
        """
        Handle indentation at the start of a line.

        Returns the first INDENT or DEDENT token produced by the change in
        indentation (further DEDENT tokens are queued in
        ``pending_tokens``), or None when no indentation token is due:
        not at a line start, a blank or comment-only line, inside brackets,
        or unchanged indentation.
        """
        if not self.at_line_start:
            return None

        # Measure indentation. Spaces and tabs each count as one unit;
        # mixing them is not detected.
        start = self._location()
        indent = 0
        while self._peek() in ' \t':
            indent += 1
            self._advance()

        # Blank lines and comment-only lines do not affect indentation.
        if self._peek() == '\n' or self._peek() == '\0':
            self.at_line_start = True
            return None
        if self._peek() == '#':
            self._skip_comment()
            return None
        if self._peek() == '/' and self._peek(1) == '*':
            self._skip_multiline_comment()
            # NOTE(review): at_line_start is still set here, so if code
            # follows the comment on the same line the caller scans it
            # without an indentation check and the check then runs on the
            # next call, mid-line. Looks unintended -- confirm the desired
            # behaviour before changing it.
            return None

        self.at_line_start = False

        # Inside brackets, indentation doesn't matter
        if self.bracket_depth > 0:
            return None

        current_indent = self.indent_stack[-1]
        if indent > current_indent:
            self.indent_stack.append(indent)
            return self._make_token(TokenType.INDENT, None, start, "")
        elif indent < current_indent:
            # Dedent - may need multiple DEDENT tokens
            dedent_tokens = []
            while self.indent_stack[-1] > indent:
                self.indent_stack.pop()
                dedent_tokens.append(
                    self._make_token(TokenType.DEDENT, None, start, "")
                )
            if self.indent_stack[-1] != indent:
                # Dedent to a level that was never indented to. Python
                # rejects this; here the new level is simply recorded.
                self.indent_stack.append(indent)
            if dedent_tokens:
                # Return the first DEDENT now, queue the rest.
                self.pending_tokens.extend(dedent_tokens[1:])
                return dedent_tokens[0]
        return None

    def _skip_whitespace_within_line(self) -> None:
        """Skip horizontal whitespace (spaces and tabs), not newlines."""
        while self._peek() in ' \t':
            self._advance()

    def _make_token(self, token_type: "TokenType", value, start: "SourceLocation",
                    lexeme: Optional[str] = None) -> "Token":
        """
        Create a token spanning from ``start`` to the current position.

        When ``lexeme`` is None it is taken from the source text between
        ``start.offset`` and the current position.
        """
        span = self._span(start)
        if lexeme is None:
            lexeme = self.source[start.offset:self.pos]
        return Token(token_type, value, lexeme, span)

    def _scan_string(self) -> "Token":
        """
        Scan a string literal starting at its opening quote.

        Three consecutive quote characters start a multi-line string.
        A single-line string may not contain a raw newline.

        Raises:
            The error built by ``error_unterminated_string`` when a newline
            or the end of source is reached before the closing quote.
        """
        start = self._location()
        quote = self._advance()  # consume opening quote

        # Check for triple-quote (multi-line string)
        if self._peek() == quote and self._peek(1) == quote:
            self._advance()  # consume second quote
            self._advance()  # consume third quote
            return self._scan_multiline_string(start, quote)

        # Single-line string
        chars = []
        while not self._is_at_end() and self._peek() != quote:
            ch = self._peek()
            if ch == '\n':
                raise error_unterminated_string(
                    self._span(start),
                    self.get_source_line(start.line)
                )
            if ch == '\\':
                self._advance()  # consume backslash
                chars.append(self._scan_escape_sequence(start))
            else:
                chars.append(self._advance())
        if self._is_at_end():
            raise error_unterminated_string(
                self._span(start),
                self.get_source_line(start.line)
            )
        self._advance()  # consume closing quote
        value = ''.join(chars)
        return self._make_token(TokenType.STRING_LITERAL, value, start)

    def _scan_multiline_string(self, start: "SourceLocation", quote: str) -> "Token":
        """
        Scan the body of a triple-quoted string (opening quotes already consumed).

        Raises:
            The error built by ``error_unterminated_multiline_string`` when
            the source ends before the closing triple quote.
        """
        chars = []
        while not self._is_at_end():
            if self._peek() == quote and self._peek(1) == quote and self._peek(2) == quote:
                self._advance()  # consume the three closing quotes
                self._advance()
                self._advance()
                value = ''.join(chars)
                return self._make_token(TokenType.STRING_LITERAL, value, start)
            ch = self._peek()
            if ch == '\\':
                self._advance()
                chars.append(self._scan_escape_sequence(start))
            else:
                chars.append(self._advance())
        raise error_unterminated_multiline_string(
            self._span(start),
            self.get_source_line(start.line)
        )

    def _scan_escape_sequence(self, string_start: "SourceLocation") -> str:
        """
        Parse an escape sequence; the backslash has already been consumed.

        Supports the single-character escapes n, t, r, backslash, both quote
        characters and 0, plus a two-digit hex escape (x) and a four-digit
        Unicode escape (u). Returns the character the escape stands for.

        Raises:
            The error built by ``error_invalid_escape_sequence`` for an
            unknown escape, an escape cut off by end of source, or hex
            digits that are not all in 0-9, a-f, A-F.
        """
        esc_start = self._location()
        if self._is_at_end():
            raise error_invalid_escape_sequence(
                "", self._span(esc_start), self.get_source_line(esc_start.line)
            )
        ch = self._advance()
        escape_chars = {
            'n': '\n',
            't': '\t',
            'r': '\r',
            '\\': '\\',
            '"': '"',
            "'": "'",
            '0': '\0',
        }
        if ch in escape_chars:
            return escape_chars[ch]
        if ch == 'x' or ch == 'u':
            # \xHH takes two hex digits, \uHHHH takes four.
            digit_count = 2 if ch == 'x' else 4
            hex_chars = ''
            for _ in range(digit_count):
                hex_chars += self._advance()
            # Check the digits explicitly: int() alone would also accept
            # surrounding whitespace, a sign and underscores.
            if not all(c in self._HEX_DIGITS for c in hex_chars):
                raise error_invalid_escape_sequence(
                    f"{ch}{hex_chars}", self._span(esc_start),
                    self.get_source_line(esc_start.line)
                )
            return chr(int(hex_chars, 16))
        raise error_invalid_escape_sequence(
            ch, self._span(esc_start), self.get_source_line(esc_start.line)
        )

    def _scan_number(self) -> "Token":
        """
        Scan a numeric literal (int or float).

        A leading 0 followed by x/X or b/B hands over to the hex or binary
        scanner. A decimal point counts as part of the number only when a
        digit follows it; an exponent (e/E with optional sign) makes the
        literal a float.

        Raises:
            The error built by ``error_invalid_number_literal`` when an
            exponent has no digits or the text cannot be converted.
        """
        start = self._location()
        first_char = self._peek()

        # Check for hex or binary prefix
        if first_char == '0':
            self._advance()
            if self._peek() in 'xX':
                return self._scan_hex_number(start)
            elif self._peek() in 'bB':
                return self._scan_binary_number(start)
            # Otherwise it's a decimal starting with 0

        # Scan decimal digits
        while self._peek().isdigit():
            self._advance()

        # Check for float (decimal point)
        is_float = False
        if self._peek() == '.' and self._peek(1).isdigit():
            is_float = True
            self._advance()  # consume '.'
            while self._peek().isdigit():
                self._advance()

        # Check for scientific notation
        if self._peek() in 'eE':
            is_float = True
            self._advance()  # consume 'e'
            if self._peek() in '+-':
                self._advance()
            if not self._peek().isdigit():
                lexeme = self.source[start.offset:self.pos]
                raise error_invalid_number_literal(
                    lexeme, self._span(start), self.get_source_line(start.line)
                )
            while self._peek().isdigit():
                self._advance()

        lexeme = self.source[start.offset:self.pos]
        try:
            if is_float:
                value = float(lexeme)
                return self._make_token(TokenType.FLOAT_LITERAL, value, start, lexeme)
            else:
                value = int(lexeme)
                return self._make_token(TokenType.INT_LITERAL, value, start, lexeme)
        except ValueError:
            raise error_invalid_number_literal(
                lexeme, self._span(start), self.get_source_line(start.line)
            )

    def _scan_hex_number(self, start: "SourceLocation") -> "Token":
        """
        Scan a hexadecimal integer literal (0x...); the leading 0 is consumed.

        Underscores are allowed as separators after the first digit.

        Raises:
            The error built by ``error_invalid_hex_literal`` when no hex
            digit follows the prefix or the text cannot be converted.
        """
        self._advance()  # consume 'x' or 'X'
        if self._peek() not in self._HEX_DIGITS:
            lexeme = self.source[start.offset:self.pos]
            raise error_invalid_hex_literal(
                lexeme, self._span(start), self.get_source_line(start.line)
            )
        while self._peek() in self._HEX_DIGITS or self._peek() == '_':
            self._advance()
        lexeme = self.source[start.offset:self.pos]
        clean_lexeme = lexeme.replace('_', '')
        try:
            value = int(clean_lexeme, 16)
            return self._make_token(TokenType.INT_LITERAL, value, start, lexeme)
        except ValueError:
            raise error_invalid_hex_literal(
                lexeme, self._span(start), self.get_source_line(start.line)
            )

    def _scan_binary_number(self, start: "SourceLocation") -> "Token":
        """
        Scan a binary integer literal (0b...); the leading 0 is consumed.

        Underscores are allowed as separators after the first digit.

        Raises:
            The error built by ``error_invalid_binary_literal`` when no
            binary digit follows the prefix or the text cannot be converted.
        """
        self._advance()  # consume 'b' or 'B'
        if self._peek() not in '01':
            lexeme = self.source[start.offset:self.pos]
            raise error_invalid_binary_literal(
                lexeme, self._span(start), self.get_source_line(start.line)
            )
        while self._peek() in '01_':
            self._advance()
        lexeme = self.source[start.offset:self.pos]
        clean_lexeme = lexeme.replace('_', '')
        try:
            value = int(clean_lexeme, 2)
            return self._make_token(TokenType.INT_LITERAL, value, start, lexeme)
        except ValueError:
            raise error_invalid_binary_literal(
                lexeme, self._span(start), self.get_source_line(start.line)
            )

    def _scan_identifier_or_keyword(self) -> "Token":
        """
        Scan an identifier or keyword.

        Words found in KEYWORDS get the mapped token type; the boolean
        spellings True/False/true/false carry a Python bool as value, other
        keywords carry their own text. Anything else is an IDENTIFIER.
        """
        start = self._location()
        while self._peek().isalnum() or self._peek() == '_':
            self._advance()
        lexeme = self.source[start.offset:self.pos]

        # Check if it's a keyword
        if lexeme in KEYWORDS:
            token_type = KEYWORDS[lexeme]
            # Handle boolean literals specially (True/False and true/false)
            if lexeme in ('True', 'False', 'true', 'false'):
                value = lexeme in ('True', 'true')
            else:
                value = lexeme
            return self._make_token(token_type, value, start, lexeme)

        # It's an identifier
        return self._make_token(TokenType.IDENTIFIER, lexeme, start, lexeme)

    def _scan_token(self) -> "Token":
        """
        Scan the next token.

        Order of work: queued tokens first, then indentation handling,
        whitespace and comments, NEWLINE (suppressed inside brackets),
        end-of-source DEDENT/EOF handling, and finally literals,
        identifiers, operators and punctuation.

        Raises:
            The error built by ``error_unexpected_character`` for a
            character that starts no known token, plus whatever the
            literal and comment scanners raise.
        """
        # Return any pending tokens first (from DEDENT sequences)
        if self.pending_tokens:
            return self.pending_tokens.pop(0)

        # Loop rather than recurse so that any number of newlines inside
        # brackets can be skipped without growing the call stack.
        while True:
            # Handle line start indentation
            indent_token = self._handle_line_start()
            if indent_token:
                return indent_token

            # Skip horizontal whitespace
            self._skip_whitespace_within_line()

            # Skip comments that follow other content on the line
            while self._peek() == '#' or (self._peek() == '/' and self._peek(1) == '*'):
                if self._peek() == '#':
                    self._skip_comment()
                else:
                    self._skip_multiline_comment()
                self._skip_whitespace_within_line()

            if self._peek() != '\n':
                break

            start = self._location()
            self._advance()
            # Only emit NEWLINE if not inside brackets
            if self.bracket_depth == 0:
                return self._make_token(TokenType.NEWLINE, None, start, "\\n")
            # Inside brackets the newline is an implicit continuation.

        if self._is_at_end():
            # At EOF, emit any remaining DEDENT tokens
            start = self._location()
            while len(self.indent_stack) > 1:
                self.indent_stack.pop()
                self.pending_tokens.append(
                    self._make_token(TokenType.DEDENT, None, start, "")
                )
            if self.pending_tokens:
                return self.pending_tokens.pop(0)
            return self._make_token(TokenType.EOF, None, self._location(), "")

        start = self._location()
        ch = self._peek()

        # String literals
        if ch in '"\'':
            return self._scan_string()

        # Numbers
        if ch.isdigit():
            return self._scan_number()

        # Identifiers and keywords
        if ch.isalpha() or ch == '_':
            return self._scan_identifier_or_keyword()

        # Everything below starts with the single character just read.
        self._advance()

        # Two-character operators
        if ch == '-' and self._match('>'):
            return self._make_token(TokenType.ARROW, "->", start)
        if ch == '=' and self._match('>'):
            return self._make_token(TokenType.DOUBLE_ARROW, "=>", start)
        if ch == '=' and self._match('='):
            return self._make_token(TokenType.EQ, "==", start)
        if ch == '!' and self._match('='):
            return self._make_token(TokenType.NE, "!=", start)
        if ch == '<' and self._match('='):
            return self._make_token(TokenType.LE, "<=", start)
        if ch == '>' and self._match('='):
            return self._make_token(TokenType.GE, ">=", start)
        if ch == '.' and self._match('.'):
            return self._make_token(TokenType.RANGE, "..", start)
        if ch == '*' and self._match('*'):
            return self._make_token(TokenType.DOUBLE_STAR, "**", start)
        if ch == '/' and self._match('/'):
            return self._make_token(TokenType.DOUBLE_SLASH, "//", start)
        if ch == '+' and self._match('='):
            return self._make_token(TokenType.PLUS_ASSIGN, "+=", start)
        if ch == '-' and self._match('='):
            return self._make_token(TokenType.MINUS_ASSIGN, "-=", start)

        # Track bracket depth for implicit line continuation. Closing
        # brackets never take the depth below zero.
        if ch == '(':
            self.bracket_depth += 1
            return self._make_token(TokenType.LPAREN, ch, start)
        if ch == ')':
            self.bracket_depth = max(0, self.bracket_depth - 1)
            return self._make_token(TokenType.RPAREN, ch, start)
        if ch == '[':
            self.bracket_depth += 1
            return self._make_token(TokenType.LBRACKET, ch, start)
        if ch == ']':
            self.bracket_depth = max(0, self.bracket_depth - 1)
            return self._make_token(TokenType.RBRACKET, ch, start)
        if ch == '{':
            self.bracket_depth += 1
            return self._make_token(TokenType.LBRACE, ch, start)
        if ch == '}':
            self.bracket_depth = max(0, self.bracket_depth - 1)
            return self._make_token(TokenType.RBRACE, ch, start)

        # Single-character tokens
        single_char_tokens = {
            '+': TokenType.PLUS,
            '-': TokenType.MINUS,
            '*': TokenType.STAR,
            '/': TokenType.SLASH,
            '%': TokenType.PERCENT,
            '<': TokenType.LT,
            '>': TokenType.GT,
            '=': TokenType.ASSIGN,
            ':': TokenType.COLON,
            ';': TokenType.SEMICOLON,  # Deprecated but kept for error messages
            ',': TokenType.COMMA,
            '.': TokenType.DOT,
            '?': TokenType.QUESTION,
            '@': TokenType.AT,
            '#': TokenType.HASH,  # Usually not reached (comments)
        }
        if ch in single_char_tokens:
            return self._make_token(single_char_tokens[ch], ch, start)

        # Unknown character
        raise error_unexpected_character(
            ch, self._span(start), self.get_source_line(start.line)
        )

    def tokenize(self) -> List["Token"]:
        """Tokenize the entire source, returning a list of tokens ending with EOF."""
        return list(self)

    def __iter__(self) -> Iterator["Token"]:
        """Iterate over tokens; the EOF token is yielded last."""
        while True:
            token = self._scan_token()
            yield token
            if token.type == TokenType.EOF:
                break
# [docs]  -- leftover link text from the rendered documentation page; not code
def tokenize(source: str, filename: Optional[str] = None) -> List[Token]:
    """
    Tokenize ``source`` in one call, without creating a Lexer by hand.

    Args:
        source: The source code to tokenize
        filename: Optional filename for error messages

    Returns:
        List of tokens

    Raises:
        LexerError: If tokenization fails
    """
    return Lexer(source, filename).tokenize()