boolean/tokenizer.go

package boolean

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"slices"
)
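
// TokenType classifies lexed tokens; ExprType distinguishes numeric from
// boolean constant expressions.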
type TokenType int
type ExprType int

const (
	EXPR_NUMERIC_CONSTANT ExprType = iota
	EXPR_BOOLEAN_CONSTANT
)

// Token types get their own const block so the TokenType iota sequence
// restarts at zero rather than continuing from the ExprType values above.
const (
	TOKEN_ERR TokenType = iota
	TOKEN_UNARY_OPERATOR
	TOKEN_BINARY_OPERATOR
	TOKEN_AMBIGUOUS_OPERATOR
	TOKEN_LEFT_PAREN
	TOKEN_RIGHT_PAREN
	TOKEN_SYMBOL
	TOKEN_NUMERIC_LITERAL
	TOKEN_BOOLEAN_LITERAL
)
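
// Token is a single lexeme: its source text, its classification, and the
// byte offset in the input at which it begins.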
type Token struct {
	Text  string
	Type  TokenType
	Index uint
}
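
// TokenStream is a read cursor over a fixed slice of tokens.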
type TokenStream struct {
	Tokens   []Token
	Position int
}
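
// Peek returns up to n tokens ahead of the current position without
// advancing it, or io.EOF once the stream is exhausted.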
func (stream *TokenStream) Peek(n int) ([]Token, error) {
	if stream.Position >= len(stream.Tokens) {
		return nil, io.EOF
	}
	// Clamp the slice bound so asking for more tokens than remain
	// returns a short result instead of panicking.
	end := min(stream.Position+n, len(stream.Tokens))
	return stream.Tokens[stream.Position:end], nil
}
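
// Read returns up to n tokens and advances the position past them,
// or io.EOF once the stream is exhausted.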
func (stream *TokenStream) Read(n int) ([]Token, error) {
	if stream.Position >= len(stream.Tokens) {
		return nil, io.EOF
	}
	end := min(stream.Position+n, len(stream.Tokens))
	ret := stream.Tokens[stream.Position:end]
	stream.Position += len(ret)
	return ret, nil
}
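
// NewTokenStream wraps tokens in a stream positioned at the first token.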
func NewTokenStream(tokens []Token) *TokenStream {
	return &TokenStream{
		Tokens:   tokens,
		Position: 0,
	}
}
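
// OperatorTree is a node in the operator trie: the embedded TokenType
// classifies the operator spelled so far, and Next maps each byte that can
// extend it to the node for the longer operator.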
type OperatorTree struct {
	TokenType
	Next map[byte]OperatorTree
}
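
// SymbolValidStart reports whether char may begin a symbol:
// an ASCII letter or underscore.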
func SymbolValidStart(char byte) bool {
	return (char >= 'a' && char <= 'z') ||
		(char >= 'A' && char <= 'Z') ||
		char == '_'
}
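
// SymbolValidCont reports whether char may continue a symbol:
// any valid start byte, an ASCII digit, or a hyphen.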
func SymbolValidCont(char byte) bool {
	return SymbolValidStart(char) ||
		(char >= '0' && char <= '9') ||
		char == '-'
}
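
// NumericValid reports whether char is an ASCII digit.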
func NumericValid(char byte) bool {
	return char >= '0' && char <= '9'
}
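
// Whitespace lists the bytes skipped between tokens. OperatorTokens is the
// operator trie keyed on an operator's leading byte; entries whose
// TokenType is TOKEN_ERR ('!' and '=') are valid only as prefixes of the
// two-byte operators "!=" and "==".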
var (
	Whitespace = []byte{' ', '\t', '\n', '\r'}

	OperatorTokens = map[byte]OperatorTree{
		'.': {
			TokenType: TOKEN_BINARY_OPERATOR,
			Next:      nil,
		},
		'~': {
			TokenType: TOKEN_UNARY_OPERATOR,
			Next:      nil,
		},
		'&': {
			TokenType: TOKEN_BINARY_OPERATOR,
			Next:      nil,
		},
		'|': {
			TokenType: TOKEN_BINARY_OPERATOR,
			Next:      nil,
		},
		'^': {
			TokenType: TOKEN_BINARY_OPERATOR,
			Next:      nil,
		},
		'+': {
			TokenType: TOKEN_BINARY_OPERATOR,
			Next:      nil,
		},
		'*': {
			TokenType: TOKEN_BINARY_OPERATOR,
			Next: map[byte]OperatorTree{
				'*': {
					TokenType: TOKEN_BINARY_OPERATOR,
					Next:      nil,
				},
			},
		},
		'/': {
			TokenType: TOKEN_BINARY_OPERATOR,
			Next:      nil,
		},
		// '-' may be unary negation or binary subtraction, so it is
		// tagged ambiguous for later disambiguation.
		'-': {
			TokenType: TOKEN_AMBIGUOUS_OPERATOR,
			Next:      nil,
		},
		'!': {
			TokenType: TOKEN_ERR,
			Next: map[byte]OperatorTree{
				'=': {
					TokenType: TOKEN_BINARY_OPERATOR,
					Next:      nil,
				},
			},
		},
		'=': {
			TokenType: TOKEN_ERR,
			Next: map[byte]OperatorTree{
				'=': {
					TokenType: TOKEN_BINARY_OPERATOR,
					Next:      nil,
				},
			},
		},
		'>': {
			TokenType: TOKEN_BINARY_OPERATOR,
			Next: map[byte]OperatorTree{
				'=': {
					TokenType: TOKEN_BINARY_OPERATOR,
					Next:      nil,
				},
			},
		},
		'<': {
			TokenType: TOKEN_BINARY_OPERATOR,
			Next: map[byte]OperatorTree{
				'=': {
					TokenType: TOKEN_BINARY_OPERATOR,
					Next:      nil,
				},
			},
		},
	}
)
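
// TokenizeOperator extends the operator in current one byte at a time,
// following the trie for as long as the next input byte is a valid
// continuation, and returns the longest match with its token type.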
func TokenizeOperator(stream *bufio.Reader, node *OperatorTree, current string) (TokenType, string, error) {
	next_chars, err := stream.Peek(1)
	if errors.Is(err, io.EOF) {
		return node.TokenType, current, nil
	} else if err != nil {
		return TOKEN_ERR, current, fmt.Errorf("TokenizeOperator peek error: %w", err)
	}
	next_node, continues := node.Next[next_chars[0]]
	if continues {
		if _, err := stream.ReadByte(); err != nil {
			return TOKEN_ERR, current, fmt.Errorf("TokenizeOperator consume error: %w", err)
		}
		return TokenizeOperator(stream, &next_node, current+string(next_chars))
	}
	return node.TokenType, current, nil
}
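
// Tokenize consumes the whole reader and produces a TokenStream of
// operators, numeric and boolean literals, symbols, and parentheses,
// recording each token's starting byte offset in Index.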
func Tokenize(stream *bufio.Reader) (*TokenStream, error) {
	tokens := []Token{}
	var position uint
	for {
		char, err := stream.ReadByte()
		if errors.Is(err, io.EOF) {
			break
		} else if err != nil {
			return nil, fmt.Errorf("tokenize read error: %w", err)
		}
		if slices.Contains(Whitespace, char) {
			slog.Debug("tokenizer", "whitespace", char)
			position += 1
			continue
		} else if node, is_operator := OperatorTokens[char]; is_operator {
			token_type, text, err := TokenizeOperator(stream, &node, string(char))
			if err != nil {
				return nil, err
			}
			// TOKEN_ERR here means a prefix such as a bare '!' or '='
			// that never completed a known operator.
			if token_type == TOKEN_ERR {
				return nil, fmt.Errorf("tokenize invalid operator %q at index %d", text, position)
			}
			slog.Debug("tokenizer", "operator", text)
			tokens = append(tokens, Token{
				Type:  token_type,
				Text:  text,
				Index: position,
			})
			position += uint(len(text))
		} else if NumericValid(char) {
			literal := string(char)
			decimal := false
			for {
				next_chars, err := stream.Peek(1)
				if errors.Is(err, io.EOF) {
					break
				} else if err != nil {
					return nil, fmt.Errorf("numeric peek error: %w", err)
				} else if NumericValid(next_chars[0]) {
					if _, err := stream.ReadByte(); err != nil {
						return nil, fmt.Errorf("numeric read error: %w", err)
					}
					literal += string(next_chars)
				} else if next_chars[0] == '.' {
					// At most one decimal point per literal; a second
					// '.' ends the number.
					if decimal {
						break
					}
					decimal = true
					if _, err := stream.ReadByte(); err != nil {
						return nil, fmt.Errorf("numeric read error: %w", err)
					}
					literal += string(next_chars)
				} else {
					break
				}
			}
			slog.Debug("tokenizer", "numeric", literal)
			tokens = append(tokens, Token{
				Type:  TOKEN_NUMERIC_LITERAL,
				Text:  literal,
				Index: position,
			})
			position += uint(len(literal))
		} else if SymbolValidStart(char) {
			symbol := string(char)
			for {
				next_chars, err := stream.Peek(1)
				if errors.Is(err, io.EOF) {
					break
				} else if err != nil {
					return nil, fmt.Errorf("symbol peek error: %w", err)
				} else if SymbolValidCont(next_chars[0]) {
					if _, err := stream.ReadByte(); err != nil {
						return nil, fmt.Errorf("symbol read error: %w", err)
					}
					symbol += string(next_chars)
				} else {
					break
				}
			}
			slog.Debug("tokenizer", "symbol", symbol)
			// TRUE and FALSE are the only reserved symbols.
			token_type := TOKEN_SYMBOL
			switch symbol {
			case "TRUE", "FALSE":
				token_type = TOKEN_BOOLEAN_LITERAL
			}
			tokens = append(tokens, Token{
				Type:  token_type,
				Text:  symbol,
				Index: position,
			})
			position += uint(len(symbol))
		} else {
			switch char {
			case '(':
				tokens = append(tokens, Token{
					Type:  TOKEN_LEFT_PAREN,
					Text:  "(",
					Index: position,
				})
				position += 1
			case ')':
				tokens = append(tokens, Token{
					Type:  TOKEN_RIGHT_PAREN,
					Text:  ")",
					Index: position,
				})
				position += 1
			default:
				return nil, fmt.Errorf("tokenize unexpected character: %c", char)
			}
		}
	}
	return NewTokenStream(tokens), nil
}
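
// A minimal usage sketch (assuming the package is imported as "boolean";
// the expression below is illustrative only):
//
//	r := bufio.NewReader(strings.NewReader("~a & (x >= 3.5) | TRUE"))
//	stream, err := boolean.Tokenize(r)
//	if err != nil {
//		log.Fatal(err)
//	}
//	for {
//		toks, err := stream.Read(1)
//		if errors.Is(err, io.EOF) {
//			break
//		}
//		fmt.Printf("%d: %q\n", toks[0].Index, toks[0].Text)
//	}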