From 3b8dcd30f5aca7624a22cff85a2f767d8d1fb583 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Mon, 28 Jul 2025 01:20:01 -0400 Subject: Add regex operator Implemented regex operator using go flavored regular expressions. Added optimization to combine regex's in `OR` clauses. --- pkg/query/compiler.go | 26 ++++++++++++++++ pkg/query/lexer.go | 21 ++++++++----- pkg/query/lexer_test.go | 4 ++- pkg/query/optimizer.go | 63 ++++++++++++++++++++++++++++++++++++++- pkg/query/optimizer_test.go | 72 +++++++++++++++++++++++++++++++++++++++++++++ pkg/query/parser.go | 45 +++++++++++++--------------- pkg/query/parser_test.go | 1 + 7 files changed, 199 insertions(+), 33 deletions(-) (limited to 'pkg/query') diff --git a/pkg/query/compiler.go b/pkg/query/compiler.go index 1e946a6..7000544 100644 --- a/pkg/query/compiler.go +++ b/pkg/query/compiler.go @@ -93,6 +93,8 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error case OP_LT: // NOTE: doesn't raise compiler error if operator used on invalid category opStr = "< " + case OP_RE: + opStr = "REGEXP " case OP_NE: if cat.IsSet() { opStr = "NOT IN " @@ -164,6 +166,30 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error idx++ sCount++ } + } else if op == OP_RE { + idx := 0 + for _, stmt := range opStmts { + b.WriteString("( ") + b.WriteString(catStr) + b.WriteString("IS NOT NULL AND ") + if stmt.Negated { + b.WriteString("NOT ") + } + b.WriteString(catStr) + b.WriteString(opStr) + arg, ok := stmt.Value.buildCompile(b) + b.WriteString(" )") + if ok { + args = append(args, arg) + } + b.WriteByte(' ') + if idx != len(opStmts)-1 { + b.WriteString(delim) + b.WriteByte(' ') + } + idx++ + sCount++ + } } else { idx := 0 for _, stmt := range opStmts { diff --git a/pkg/query/lexer.go b/pkg/query/lexer.go index 79b6f0f..de344b7 100644 --- a/pkg/query/lexer.go +++ b/pkg/query/lexer.go @@ -29,7 +29,8 @@ const ( TOK_OP_LT // less than TOK_OP_LE // less than or equal TOK_OP_GE // greater than or equal - TOK_OP_GT // greaterthan + TOK_OP_GT // greater than + TOK_OP_RE // regex match // categories TOK_CAT_PATH TOK_CAT_TITLE @@ -67,6 +68,8 @@ func (tokType queryTokenType) String() string { return "Equal" case TOK_OP_AP: return "Approximate" + case TOK_OP_RE: + return "Regular Expression" case TOK_OP_NE: return "Not Equal" case TOK_OP_LT: @@ -118,16 +121,18 @@ func (tokType queryTokenType) Any(expected ...queryTokenType) bool { return slices.Contains(expected, tokType) } -func (t queryTokenType) isClause() bool { - return t == TOK_CLAUSE_OR || t == TOK_CLAUSE_AND || t == TOK_CLAUSE_START || t == TOK_CLAUSE_END -} - func (t queryTokenType) isCategory() bool { return t.Any(TOK_CAT_PATH, TOK_CAT_TITLE, TOK_CAT_AUTHOR, TOK_CAT_DATE, TOK_CAT_FILETIME, TOK_CAT_TAGS, TOK_CAT_LINKS, TOK_CAT_META) } -func (t queryTokenType) isOperation() bool { + +func (t queryTokenType) isDateOperation() bool { return t.Any(TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_LT, TOK_OP_LE, TOK_OP_GE, TOK_OP_GT) } + +func (t queryTokenType) isStringOperation() bool { + return t.Any(TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_RE) +} + func (t queryTokenType) isValue() bool { return t == TOK_VAL_STR || t == TOK_VAL_DATETIME } @@ -233,6 +238,8 @@ func tokenizeOperation(s string) Token { t.Type = TOK_OP_LT case ">": t.Type = TOK_OP_GT + case "!re!": + t.Type = TOK_OP_RE } return t @@ -321,7 +328,7 @@ func TokensStringify(tokens []Token) string { func init() { negPattern := `(?-?)` categoryPattern := `(?T|p(?:ath)?|a(?:uthor)?|d(?:ate)?|f(?:iletime)?|t(?:ags|itle)?|l(?:inks)?|m(?:eta)?)` - opPattern := `(?!=|<=|>=|=|:|~|<|>)` + opPattern := `(?!re!|!=|<=|>=|=|:|~|<|>)` valPattern := `(?".*?"|\S*[^\s\)])` statementPattern := `(?` + negPattern + categoryPattern + opPattern + valPattern + `)` unknownPattern := `(?\S*".*?"[^\s)]*|\S*[^\s\)])` diff --git a/pkg/query/lexer_test.go b/pkg/query/lexer_test.go index 5888902..35bdc4f 100644 --- a/pkg/query/lexer_test.go +++ b/pkg/query/lexer_test.go @@ -22,6 +22,7 @@ const ( TOK_OP_LE = query.TOK_OP_LE TOK_OP_GE = query.TOK_OP_GE TOK_OP_GT = query.TOK_OP_GT + TOK_OP_RE = query.TOK_OP_RE TOK_CAT_TITLE = query.TOK_CAT_TITLE TOK_CAT_AUTHOR = query.TOK_CAT_AUTHOR TOK_CAT_DATE = query.TOK_CAT_DATE @@ -77,7 +78,7 @@ func TestLex(t *testing.T) { {Type: TOK_CLAUSE_END}, {Type: TOK_CLAUSE_END}, }}, - {"nested clauses", "a:a (or t:b t!=c) or d<=01010001 and -T~foo", []Token{ + {"nested clauses", "a:a (or t:b t!=c) or d<=01010001 and -T~foo t!re!bar", []Token{ {Type: TOK_CLAUSE_START}, {TOK_CLAUSE_AND, "and"}, {TOK_CAT_AUTHOR, "a"}, {TOK_OP_AP, ":"}, {TOK_VAL_STR, "a"}, {Type: TOK_CLAUSE_START}, {TOK_CLAUSE_OR, "or"}, @@ -88,6 +89,7 @@ func TestLex(t *testing.T) { {TOK_CAT_DATE, "d"}, {TOK_OP_LE, "<="}, {TOK_VAL_DATETIME, "01010001"}, {Type: TOK_CLAUSE_START}, {TOK_CLAUSE_AND, "and"}, {TOK_OP_NEG, "-"}, {TOK_CAT_TITLE, "T"}, {TOK_OP_AP, "~"}, {TOK_VAL_STR, "foo"}, + {TOK_CAT_TAGS, "t"}, {TOK_OP_RE, "!re!"}, {TOK_VAL_STR, "bar"}, {Type: TOK_CLAUSE_END}, {Type: TOK_CLAUSE_END}, {Type: TOK_CLAUSE_END}, diff --git a/pkg/query/optimizer.go b/pkg/query/optimizer.go index 3c435e8..2cc610a 100644 --- a/pkg/query/optimizer.go +++ b/pkg/query/optimizer.go @@ -1,6 +1,7 @@ package query import ( + "bytes" "slices" "strings" "sync" @@ -52,7 +53,6 @@ func (o Optimizer) Optimize(level int) { if level < 0 { return } else if level == 0 { - // TODO: determine smarter level determination strategy level = o.root.Depth() } @@ -63,6 +63,7 @@ func (o Optimizer) Optimize(level int) { o.StrictEquality() o.Tighten() o.Contradictions() + o.MergeRegex() // parallel + serial o.Tidy() // purely serial @@ -326,6 +327,66 @@ func (o Optimizer) StrictEquality() { }) } +// Merge regular within a clause +func (o *Optimizer) MergeRegex() { + if !o.isSorted { + o.SortStatements() + } + + pool := &sync.Pool{} + pool.New = func() any { + return &bytes.Buffer{} + } + + o.parallel(func(c *Clause) { + if c.Operator != COP_OR { + return + } + + buf := pool.Get().(*bytes.Buffer) + defer pool.Put(buf) + defer buf.Reset() + sortChanged := false + for _, catStmts := range c.Statements.CategoryPartition() { + for op, opStmts := range catStmts.OperatorPartition() { + if op != OP_RE { + continue + } + + for _, stmts := range opStmts.NegatedPartition() { + if len(stmts) <= 1 { + continue + } + sortChanged = true + + for i, stmt := range stmts { + if i == 0 { + buf.WriteByte('(') + buf.WriteString(stmt.Value.(StringValue).S) + buf.WriteByte('|') + } else if i == len(stmts)-1 { + buf.WriteString(stmt.Value.(StringValue).S) + buf.WriteByte(')') + stmts[i] = Statement{} + } else { + buf.WriteString(stmt.Value.(StringValue).S) + buf.WriteByte('|') + stmts[i] = Statement{} + } + } + stmts[0].Value = StringValue{S: buf.String()} + buf.Reset() + } + + break + } + } + if sortChanged { + o.isSorted = false + } + }) +} + // Shrink approximate statements and ranges // // Examples: diff --git a/pkg/query/optimizer_test.go b/pkg/query/optimizer_test.go index 1826c81..bb9c0ba 100644 --- a/pkg/query/optimizer_test.go +++ b/pkg/query/optimizer_test.go @@ -647,3 +647,75 @@ func TestOptimizer_Tighten(t *testing.T) { }) } } + +func TestOptimizer_MergeRegex(t *testing.T) { + tests := []struct { + name string + c *query.Clause + want query.Clause + }{ + { + "only positve", + &query.Clause{ + Operator: query.COP_OR, + Statements: []query.Statement{ + {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"a"}}, + {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"b"}}, + }, + }, + query.Clause{ + Operator: query.COP_OR, + Statements: []query.Statement{ + {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"(a|b)"}}, + }, + }, + }, + { + "multiple categories", + &query.Clause{ + Operator: query.COP_OR, + Statements: []query.Statement{ + {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"a"}}, + {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"b"}}, + {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"c"}}, + {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"d"}}, + }, + }, + query.Clause{ + Operator: query.COP_OR, + Statements: []query.Statement{ + {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"(a|b)"}}, + {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"(c|d)"}}, + }, + }, + }, + { + "mixed negations", + &query.Clause{ + Operator: query.COP_OR, + Statements: []query.Statement{ + {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"a"}}, + {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"b"}}, + {Negated: true, Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"c"}}, + {Negated: true, Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"d"}}, + }, + }, + query.Clause{ + Operator: query.COP_OR, + Statements: []query.Statement{ + {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"(a|b)"}}, + {Negated: true, Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"(c|d)"}}, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + o := query.NewOptimizer(tt.c, WORKERS) + o.MergeRegex() + o.Tidy() + + clauseEqTest(t, tt.c, &tt.want) + }) + } +} diff --git a/pkg/query/parser.go b/pkg/query/parser.go index 0b33a37..ce4ee7b 100644 --- a/pkg/query/parser.go +++ b/pkg/query/parser.go @@ -36,6 +36,7 @@ const ( OP_LE // less than or equal OP_GE // greater than or equal OP_GT // greater than + OP_RE // regular expresion ) type clauseOperator int16 @@ -165,7 +166,7 @@ func (t catType) String() string { } func (t opType) IsFuzzy() bool { - return t == OP_AP || t.IsOrder() + return t == OP_AP || t == OP_RE || t.IsOrder() } func (t opType) IsOrder() bool { @@ -188,6 +189,8 @@ func (t opType) String() string { return "Greater Than or Equal" case OP_GT: return "Greater Than" + case OP_RE: + return "Regular Expression" default: return "Invalid" } @@ -234,6 +237,8 @@ func tokToOp(t queryTokenType) opType { return OP_GE case TOK_OP_GT: return OP_GT + case TOK_OP_RE: + return OP_RE default: return OP_UNKNOWN } @@ -241,7 +246,7 @@ func tokToOp(t queryTokenType) opType { // Apply negation to a statements operator func (s *Statement) Simplify() { - if s.Negated && s.Operator != OP_AP { + if s.Negated && s.Operator != OP_AP && s.Operator != OP_RE { s.Negated = false switch s.Operator { case OP_EQ: @@ -324,23 +329,15 @@ func (s Statements) NegatedPartition() iter.Seq2[bool, Statements] { } return func(yield func(bool, Statements) bool) { - firstNegated := -1 - for i, stmt := range s { - if stmt.Negated { - firstNegated = i - } - } - - if firstNegated > 0 { - if !yield(false, s[:firstNegated]) { - return - } else if !yield(true, s[firstNegated:]) { - return - } - } else { - if !yield(false, s) { - return - } + firstNegated := slices.IndexFunc(s, func(stmt Statement) bool { return stmt.Negated }) + if firstNegated == -1 { + yield(false, s) + return + } else if firstNegated == 0 { + yield(true, s) + return + } else if !yield(false, s[:firstNegated]) { + } else if !yield(true, s[firstNegated:]) { } } } @@ -507,7 +504,7 @@ func Parse(tokens []Token) (*Clause, error) { stmt := Statement{Category: tokToCat(token.Type)} clause.Statements = append(clause.Statements, stmt) } - case TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_LT, TOK_OP_LE, TOK_OP_GE, TOK_OP_GT: + case TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_LT, TOK_OP_LE, TOK_OP_GE, TOK_OP_GT, TOK_OP_RE: if !prevToken.Type.isCategory() { return nil, &TokenError{ got: token, @@ -518,21 +515,21 @@ func Parse(tokens []Token) (*Clause, error) { clause.Statements[len(clause.Statements)-1].Operator = tokToOp(token.Type) case TOK_VAL_STR: - if !prevToken.Type.isOperation() { + if !prevToken.Type.isStringOperation() { return nil, &TokenError{ got: token, gotPrev: prevToken, - wantPrev: "operation", + wantPrev: "string operation", } } clause.Statements[len(clause.Statements)-1].Value = StringValue{token.Value} case TOK_VAL_DATETIME: - if !prevToken.Type.isOperation() { + if !prevToken.Type.isDateOperation() { return nil, &TokenError{ got: token, gotPrev: prevToken, - wantPrev: "operation", + wantPrev: "date operation", } } diff --git a/pkg/query/parser_test.go b/pkg/query/parser_test.go index 5f68006..2e85123 100644 --- a/pkg/query/parser_test.go +++ b/pkg/query/parser_test.go @@ -26,6 +26,7 @@ const ( OP_LE = query.OP_LE OP_GE = query.OP_GE OP_GT = query.OP_GT + OP_RE = query.OP_RE ) func TestParse(t *testing.T) { -- cgit v1.2.3