author    JP Appel <jeanpierre.appel01@gmail.com>    2025-05-28 00:45:01 -0400
committer JP Appel <jeanpierre.appel01@gmail.com>    2025-05-28 00:45:01 -0400
commit    bff7aeafc3a61f933ab200afa789b74beab03e91 (patch)
tree      15e71e38b12c0344297cb75922d1822882a845a5
parent    417691f0fb4b1b53880c940ff3dbf885c1956151 (diff)
Add lexer for query DSL
-rw-r--r--  pkg/query/lexer.go       346
-rw-r--r--  pkg/query/lexer_test.go   71
2 files changed, 417 insertions, 0 deletions
diff --git a/pkg/query/lexer.go b/pkg/query/lexer.go
new file mode 100644
index 0000000..db7ea28
--- /dev/null
+++ b/pkg/query/lexer.go
@@ -0,0 +1,346 @@
+package query
+
+import (
+ "fmt"
+ "regexp"
+ "slices"
+ "strings"
+)
+
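+// queryTokenType identifies the kind of token produced by the lexer.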
+type queryTokenType int
+
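+// lexRegex is the tokenizing pattern assembled and compiled in init.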
+var lexRegex *regexp.Regexp
+
+const (
+ TOK_UNKNOWN queryTokenType = iota
+
+ // clause tokens
+ TOK_CLAUSE_OR // clause or
+ TOK_CLAUSE_AND // clause and
+ TOK_CLAUSE_START
+ TOK_CLAUSE_END
+
+ // statement tokens
+ TOK_OP_NEG // negation
+ TOK_OP_EQ // equal
+ TOK_OP_AP // approximate/fuzzy
+ TOK_OP_NE // not equal
+ TOK_OP_LT // less than
+ TOK_OP_LE // less than or equal
+ TOK_OP_GE // greater than or equal
+ TOK_OP_GT // greater than
+ TOK_OP_PIPE // external pipe
+ TOK_OP_ARG // external arg
+ // categories
+ TOK_CAT_TITLE
+ TOK_CAT_AUTHOR
+ TOK_CAT_DATE
+ TOK_CAT_FILETIME
+ TOK_CAT_TAGS
+ TOK_CAT_LINKS
+ TOK_CAT_META
+ // values
+ TOK_VAL_STR
+ TOK_VAL_DATETIME
+)
+
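+// Token pairs a token type with the text it was lexed from; quoted values are
+// stored without their surrounding quotes.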
+type Token struct {
+ Type queryTokenType
+ Value string
+}
+
+func (tokType queryTokenType) String() string {
+ switch tokType {
+ case TOK_UNKNOWN:
+ return "Unknown"
+ case TOK_CLAUSE_OR:
+ return "Or"
+ case TOK_CLAUSE_AND:
+ return "And"
+ case TOK_CLAUSE_START:
+ return "Start Clause"
+ case TOK_CLAUSE_END:
+ return "End Clause"
+ case TOK_OP_NEG:
+ return "Negation"
+ case TOK_OP_EQ:
+ return "Equal"
+ case TOK_OP_AP:
+ return "Approximate"
+ case TOK_OP_NE:
+ return "Not Equal"
+ case TOK_OP_LT:
+ return "Less Than"
+ case TOK_OP_LE:
+ return "Less Than or Equal"
+ case TOK_OP_GE:
+ return "Greater Than or Equal"
+ case TOK_OP_GT:
+ return "Greater Than"
+ case TOK_OP_PIPE:
+ return "Pipe External Command"
+ case TOK_OP_ARG:
+ return "Argument External Command"
+ case TOK_CAT_TITLE:
+ return "Title Category"
+ case TOK_CAT_AUTHOR:
+ return "Author Category"
+ case TOK_CAT_DATE:
+ return "Date Category"
+ case TOK_CAT_FILETIME:
+ return "Filetime Category"
+ case TOK_CAT_TAGS:
+ return "Tags Category"
+ case TOK_CAT_LINKS:
+ return "Links Category"
+ case TOK_CAT_META:
+ return "Metadata Category"
+ case TOK_VAL_DATETIME:
+ return "Datetime Value"
+ case TOK_VAL_STR:
+ return "String Value"
+ default:
+ return "Invalid"
+ }
+}
+
+func (t Token) String() string {
+ return fmt.Sprint(t.Type.String(), ": ", t.Value)
+}
+
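+// Equal reports whether two tokens match: value tokens are compared on type
+// and value, all other tokens on type only.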
+func (t Token) Equal(other Token) bool {
+ if t.Type.isValue() {
+ return t.Type == other.Type && t.Value == other.Value
+ }
+ return t.Type == other.Type
+}
+
+// Any reports whether the token type is one of the expected types.
+func (tokType queryTokenType) Any(expected ...queryTokenType) bool {
+ return slices.Contains(expected, tokType)
+}
+
+func (t queryTokenType) isClause() bool {
+ return t == TOK_CLAUSE_OR || t == TOK_CLAUSE_AND || t == TOK_CLAUSE_START || t == TOK_CLAUSE_END
+}
+
+func (t queryTokenType) isCategory() bool {
+ return t.Any(TOK_CAT_TITLE, TOK_CAT_AUTHOR, TOK_CAT_DATE, TOK_CAT_FILETIME, TOK_CAT_TAGS, TOK_CAT_LINKS, TOK_CAT_META)
+}
+func (t queryTokenType) isOperation() bool {
+ return t.Any(TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_LT, TOK_OP_LE, TOK_OP_GE, TOK_OP_GT, TOK_OP_PIPE, TOK_OP_ARG)
+}
+func (t queryTokenType) isValue() bool {
+ return t == TOK_VAL_STR || t == TOK_VAL_DATETIME
+}
+
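+// Lex tokenizes a query string into a flat token stream. The whole query is
+// wrapped in an implicit AND clause, unrecognized input becomes TOK_UNKNOWN
+// tokens, and any clauses still open at the end are closed. For example,
+// Lex(`a:"ken thompson"`) produces: Start Clause, And, Author Category,
+// Approximate, String Value "ken thompson", End Clause.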
+func Lex(query string) []Token {
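+ // submatch indices into each lexRegex match; the order mirrors the capture
+ // groups assembled in init, with index 0 being the full match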
+ const (
+ MATCH = iota
+ CLAUSE_START
+ CLAUSE_OPERATOR
+ STATEMENT
+ NEGATION
+ CATEGORY
+ OPERATOR
+ VALUE
+ UNKNOWN
+ CLAUSE_END
+ )
+
+ matches := lexRegex.FindAllStringSubmatch(query, -1)
+ tokens := make([]Token, 0, 4*len(matches))
+
+ tokens = append(tokens, Token{Type: TOK_CLAUSE_START})
+ tokens = append(tokens, Token{TOK_CLAUSE_AND, "and"}) // by default, all arguments are AND-ed together
+ clauseLevel := 1
+ for i, match := range matches {
+ if match[CLAUSE_START] != "" {
+ tokens = append(tokens, Token{Type: TOK_CLAUSE_START})
+ // TODO: set maximum nest level
+ clauseLevel += 1
+ }
+ if match[CLAUSE_OPERATOR] != "" {
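+ // a clause operator not preceded by an explicit "(" opens an implicit clause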
+ if tokens[len(tokens)-1].Type != TOK_CLAUSE_START {
+ tokens = append(tokens, Token{Type: TOK_CLAUSE_START})
+ clauseLevel += 1
+ }
+ tokens = append(tokens, tokenizeClauseOperation(match[CLAUSE_OPERATOR]))
+ }
+
+ if t, ok := tokenizeNegation(match[NEGATION]); ok {
+ tokens = append(tokens, t)
+ }
+
+ if match[CATEGORY] != "" {
+ tokens = append(tokens, tokenizeCategory(match[CATEGORY]))
+ }
+ if match[OPERATOR] != "" {
+ tokens = append(tokens, tokenizeOperation(match[OPERATOR]))
+ }
+ if match[VALUE] != "" {
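+ // the category token for this value sits two positions back (category, operator, value)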
+ tokens = append(tokens, tokenizeValue(match[VALUE], tokens[len(tokens)-2].Type))
+ }
+
+ if match[UNKNOWN] != "" {
+ tokens = append(tokens, Token{Value: match[UNKNOWN]})
+ }
+
+ if match[CLAUSE_END] != "" {
+ tokens = append(tokens, Token{Type: TOK_CLAUSE_END})
+ // TODO: raise err if clauseLevel is 0
+ clauseLevel -= 1
+ }
+ }
+
+ for range clauseLevel {
+ tokens = append(tokens, Token{Type: TOK_CLAUSE_END})
+ }
+
+ return tokens
+}
+
+func tokenizeClauseOperation(s string) Token {
+ t := Token{Value: s}
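+ // lexRegex matches clause operators case-insensitively, so normalize before comparing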
+ switch strings.ToLower(s) {
+ case "and":
+ t.Type = TOK_CLAUSE_AND
+ case "or":
+ t.Type = TOK_CLAUSE_OR
+ }
+ return t
+}
+
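+// tokenizeNegation returns a negation token for s; the boolean reports whether
+// a negation marker was present.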
+func tokenizeNegation(s string) (Token, bool) {
+ t := Token{Value: s}
+ if s == "-" {
+ t.Type = TOK_OP_NEG
+ }
+
+ return t, len(s) > 0
+}
+
+func tokenizeOperation(s string) Token {
+ t := Token{Value: s}
+ switch s {
+ case "!=":
+ t.Type = TOK_OP_NE
+ case "!+":
+ t.Type = TOK_OP_ARG
+ case "<=":
+ t.Type = TOK_OP_LE
+ case ">=":
+ t.Type = TOK_OP_GE
+ case "=":
+ t.Type = TOK_OP_EQ
+ case ":", "~":
+ t.Type = TOK_OP_AP
+ case "<":
+ t.Type = TOK_OP_LT
+ case ">":
+ t.Type = TOK_OP_GT
+ case "!":
+ t.Type = TOK_OP_PIPE
+ }
+
+ return t
+}
+
+func tokenizeCategory(s string) Token {
+ t := Token{Value: s}
+ switch s {
+ case "T", "title":
+ t.Type = TOK_CAT_TITLE
+ case "a", "author":
+ t.Type = TOK_CAT_AUTHOR
+ case "d", "date":
+ t.Type = TOK_CAT_DATE
+ case "f", "filetime":
+ t.Type = TOK_CAT_FILETIME
+ case "t", "tags":
+ t.Type = TOK_CAT_TAGS
+ case "l", "links":
+ t.Type = TOK_CAT_LINKS
+ case "m", "meta":
+ t.Type = TOK_CAT_META
+ }
+ return t
+}
+
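+// tokenizeValue strips surrounding double quotes from s and chooses the value
+// type (string or datetime) based on the category the value belongs to.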
+func tokenizeValue(s string, catType queryTokenType) Token {
+ t := Token{}
+ if len(s) >= 2 && s[0] == '"' && s[len(s)-1] == '"' {
+ t.Value = s[1 : len(s)-1]
+ } else {
+ t.Value = s
+ }
+ switch catType {
+ case TOK_CAT_DATE, TOK_CAT_FILETIME:
+ t.Type = TOK_VAL_DATETIME
+ case TOK_CAT_TITLE, TOK_CAT_AUTHOR, TOK_CAT_TAGS, TOK_CAT_LINKS, TOK_CAT_META:
+ t.Type = TOK_VAL_STR
+ }
+ return t
+}
+
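+// treeStringify renders a token slice as an indented tree; the tests use it to
+// print readable output when a case fails.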
+func treeStringify(tokens []Token) string {
+ b := strings.Builder{}
+
+ indentLvl := 0
+ writeIndent := func(level int) {
+ for range level {
+ b.WriteString("\t")
+ }
+ }
+ writeToken := func(t Token) {
+ b.WriteByte('`')
+ b.WriteString(t.String())
+ b.WriteByte('`')
+ }
+
+ for i, token := range tokens {
+ switch token.Type {
+ case TOK_CLAUSE_START:
+ writeIndent(indentLvl)
+ b.WriteByte('(')
+ case TOK_CLAUSE_END:
+ indentLvl -= 1
+ writeIndent(indentLvl)
+ b.WriteString(")\n")
+ case TOK_CLAUSE_OR:
+ b.WriteString("or\n")
+ indentLvl += 1
+ case TOK_CLAUSE_AND:
+ b.WriteString("and\n")
+ indentLvl += 1
+ case TOK_CAT_TITLE, TOK_CAT_AUTHOR, TOK_CAT_DATE, TOK_CAT_FILETIME, TOK_CAT_TAGS, TOK_CAT_LINKS, TOK_CAT_META, TOK_OP_NEG:
+ if i == 0 || tokens[i-1].Type != TOK_OP_NEG {
+ writeIndent(indentLvl)
+ }
+ writeToken(token)
+ case TOK_VAL_STR, TOK_VAL_DATETIME, TOK_UNKNOWN:
+ writeToken(token)
+ b.WriteByte('\n')
+ default:
+ writeToken(token)
+ }
+ }
+
+ return b.String()
+}
+
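+// init assembles the named capture groups into a single clause pattern and
+// compiles it into lexRegex.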
+func init() {
+ negPattern := `(?<negation>-?)`
+ categoryPattern := `(?<category>T|a(?:uthor)?|d(?:ate)?|f(?:iletime)?|t(?:ags|itle)?|l(?:inks)?|m(?:eta)?)`
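+ // the "+" is escaped so the external-arg operator "!+" is matched literally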
+ opPattern := `(?<operator>!=|!\+|<=|>=|=|:|~|<|>|!)`
+ valPattern := `(?<value>".*?"|\S*[^\s\)])`
+ statementPattern := `(?<statement>` + negPattern + categoryPattern + opPattern + valPattern + `)`
+ unknownPattern := `(?<unknown>\S*".*?"[^\s)]*|\S*[^\s\)])`
+
+ clauseOpPattern := `(?<clause_operator>(?i)and|or)?`
+ clauseStart := `(?<clause_start>\()?`
+ clauseEnd := `(?<clause_end>\))?`
+ clausePattern := clauseStart + `\s*` + clauseOpPattern + `\s*(?:` + statementPattern + `|` + unknownPattern + `)\s*` + clauseEnd
+
+ lexRegex = regexp.MustCompile(clausePattern)
+}
diff --git a/pkg/query/lexer_test.go b/pkg/query/lexer_test.go
new file mode 100644
index 0000000..761055e
--- /dev/null
+++ b/pkg/query/lexer_test.go
@@ -0,0 +1,71 @@
+package query
+
+import (
+ "testing"
+)
+
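+// TestLex compares lexer output against expected token streams; Token.Equal
+// only compares values for value tokens, so clause and operator token values
+// are not asserted.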
+func TestLex(t *testing.T) {
+ tests := []struct {
+ name string
+ query string
+ want []Token
+ }{
+ {"empty query", "", []Token{{Type: TOK_CLAUSE_START}, {TOK_CLAUSE_AND, "and"}, {Type: TOK_CLAUSE_END}}},
+ {"quoted statement", `a:"ken thompson"`, []Token{
+ {Type: TOK_CLAUSE_START}, {TOK_CLAUSE_AND, "and"},
+ {TOK_CAT_AUTHOR, "a"}, {TOK_OP_AP, ":"}, {TOK_VAL_STR, "ken thompson"},
+ {Type: TOK_CLAUSE_END},
+ }},
+ {"invalid token", `foo:bar`, []Token{
+ {Type: TOK_CLAUSE_START}, {TOK_CLAUSE_AND, "and"},
+ {TOK_UNKNOWN, "foo:bar"},
+ {Type: TOK_CLAUSE_END},
+ }},
+ {"simple query", "a:a t:b d:01010001", []Token{
+ {Type: TOK_CLAUSE_START}, {TOK_CLAUSE_AND, "and"},
+ {TOK_CAT_AUTHOR, "a"}, {TOK_OP_AP, ":"}, {TOK_VAL_STR, "a"},
+ {TOK_CAT_TAGS, "t"}, {TOK_OP_AP, ":"}, {TOK_VAL_STR, "b"},
+ {TOK_CAT_DATE, "d"}, {TOK_OP_AP, ":"}, {TOK_VAL_DATETIME, "01010001"},
+ {Type: TOK_CLAUSE_END},
+ }},
+ {"nested clauses", "a:a (or t:b t!=c) or d<=01010001 and -T~foo", []Token{
+ {Type: TOK_CLAUSE_START}, {TOK_CLAUSE_AND, "and"},
+ {TOK_CAT_AUTHOR, "a"}, {TOK_OP_AP, ":"}, {TOK_VAL_STR, "a"},
+ {Type: TOK_CLAUSE_START}, {TOK_CLAUSE_OR, "or"},
+ {TOK_CAT_TAGS, "t"}, {TOK_OP_AP, ":"}, {TOK_VAL_STR, "b"},
+ {TOK_CAT_TAGS, "t"}, {TOK_OP_NE, "!="}, {TOK_VAL_STR, "c"},
+ {Type: TOK_CLAUSE_END},
+ {Type: TOK_CLAUSE_START}, {TOK_CLAUSE_OR, "or"},
+ {TOK_CAT_DATE, "d"}, {TOK_OP_LE, "<="}, {TOK_VAL_DATETIME, "01010001"},
+ {Type: TOK_CLAUSE_START}, {TOK_CLAUSE_AND, "and"},
+ {TOK_OP_NEG, "-"}, {TOK_CAT_TITLE, "T"}, {TOK_OP_AP, "~"}, {TOK_VAL_STR, "foo"},
+ {Type: TOK_CLAUSE_END},
+ {Type: TOK_CLAUSE_END},
+ {Type: TOK_CLAUSE_END},
+ }},
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ got := Lex(tt.query)
+
+ gl, wl := len(got), len(tt.want)
+ if gl != wl {
+ t.Errorf("Got %d tokens wanted %d\n", gl, wl)
+ }
+
+ for i := range min(gl, wl) {
+ gt, wt := got[i], tt.want[i]
+ if !gt.Equal(wt) {
+ t.Errorf("Got different token than wanted at %d\n", i)
+ t.Logf("(%v) != (%v)\n", gt.String(), wt.String())
+ break
+ }
+ }
+
+ if t.Failed() {
+ t.Log("Got\n", treeStringify(got))
+ t.Log("Want\n", treeStringify(tt.want))
+ }
+ })
+ }
+}