diff options
| author | JP Appel <jeanpierre.appel01@gmail.com> | 2025-07-28 01:20:01 -0400 |
|---|---|---|
| committer | JP Appel <jeanpierre.appel01@gmail.com> | 2025-07-28 01:20:01 -0400 |
| commit | 3b8dcd30f5aca7624a22cff85a2f767d8d1fb583 (patch) | |
| tree | db01e57776ef9a8bc104403cd038e303c6831500 /pkg | |
| parent | 7b5cd075161bd4e1a05070d51cc64b38882ae74b (diff) | |
Add regex operator
Implemented regex operator using go flavored regular expressions.
Added optimization to combine regex's in `OR` clauses.
Diffstat (limited to 'pkg')
| -rw-r--r-- | pkg/data/db.go | 21 | ||||
| -rw-r--r-- | pkg/query/compiler.go | 26 | ||||
| -rw-r--r-- | pkg/query/lexer.go | 21 | ||||
| -rw-r--r-- | pkg/query/lexer_test.go | 4 | ||||
| -rw-r--r-- | pkg/query/optimizer.go | 63 | ||||
| -rw-r--r-- | pkg/query/optimizer_test.go | 72 | ||||
| -rw-r--r-- | pkg/query/parser.go | 45 | ||||
| -rw-r--r-- | pkg/query/parser_test.go | 1 | ||||
| -rw-r--r-- | pkg/shell/interpreter.go | 4 |
9 files changed, 221 insertions, 36 deletions
diff --git a/pkg/data/db.go b/pkg/data/db.go index 178c19d..cf58caf 100644 --- a/pkg/data/db.go +++ b/pkg/data/db.go @@ -5,12 +5,13 @@ import ( "database/sql" "fmt" "log/slog" + "regexp" "strings" "time" "github.com/jpappel/atlas/pkg/index" "github.com/jpappel/atlas/pkg/query" - _ "github.com/mattn/go-sqlite3" + "github.com/mattn/go-sqlite3" ) type Query struct { @@ -51,7 +52,7 @@ func NewQuery(filename string) *Query { func NewDB(filename string) *sql.DB { connStr := "file:" + filename + "?_fk=true&_journal=WAL" - db, err := sql.Open("sqlite3", connStr) + db, err := sql.Open("sqlite3_regex", connStr) if err != nil { panic(err) } @@ -64,7 +65,7 @@ func NewDB(filename string) *sql.DB { } func NewMemDB() *sql.DB { - db, err := sql.Open("sqlite3", ":memory:?_fk=true") + db, err := sql.Open("sqlite3_regex", ":memory:?_fk=true") if err != nil { panic(err) } @@ -420,3 +421,17 @@ func (q Query) Execute(artifact query.CompilationArtifact) (map[string]*index.Do return f.docs, nil } + +func regex(re, s string) (bool, error) { + return regexp.MatchString(re, s) +} + +func init() { + sql.Register("sqlite3_regex", + &sqlite3.SQLiteDriver{ + ConnectHook: func(sc *sqlite3.SQLiteConn) error { + return sc.RegisterFunc("regexp", regex, true) + }, + }, + ) +} diff --git a/pkg/query/compiler.go b/pkg/query/compiler.go index 1e946a6..7000544 100644 --- a/pkg/query/compiler.go +++ b/pkg/query/compiler.go @@ -93,6 +93,8 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error case OP_LT: // NOTE: doesn't raise compiler error if operator used on invalid category opStr = "< " + case OP_RE: + opStr = "REGEXP " case OP_NE: if cat.IsSet() { opStr = "NOT IN " @@ -164,6 +166,30 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error idx++ sCount++ } + } else if op == OP_RE { + idx := 0 + for _, stmt := range opStmts { + b.WriteString("( ") + b.WriteString(catStr) + b.WriteString("IS NOT NULL AND ") + if stmt.Negated { + b.WriteString("NOT ") + } + b.WriteString(catStr) + b.WriteString(opStr) + arg, ok := stmt.Value.buildCompile(b) + b.WriteString(" )") + if ok { + args = append(args, arg) + } + b.WriteByte(' ') + if idx != len(opStmts)-1 { + b.WriteString(delim) + b.WriteByte(' ') + } + idx++ + sCount++ + } } else { idx := 0 for _, stmt := range opStmts { diff --git a/pkg/query/lexer.go b/pkg/query/lexer.go index 79b6f0f..de344b7 100644 --- a/pkg/query/lexer.go +++ b/pkg/query/lexer.go @@ -29,7 +29,8 @@ const ( TOK_OP_LT // less than TOK_OP_LE // less than or equal TOK_OP_GE // greater than or equal - TOK_OP_GT // greaterthan + TOK_OP_GT // greater than + TOK_OP_RE // regex match // categories TOK_CAT_PATH TOK_CAT_TITLE @@ -67,6 +68,8 @@ func (tokType queryTokenType) String() string { return "Equal" case TOK_OP_AP: return "Approximate" + case TOK_OP_RE: + return "Regular Expression" case TOK_OP_NE: return "Not Equal" case TOK_OP_LT: @@ -118,16 +121,18 @@ func (tokType queryTokenType) Any(expected ...queryTokenType) bool { return slices.Contains(expected, tokType) } -func (t queryTokenType) isClause() bool { - return t == TOK_CLAUSE_OR || t == TOK_CLAUSE_AND || t == TOK_CLAUSE_START || t == TOK_CLAUSE_END -} - func (t queryTokenType) isCategory() bool { return t.Any(TOK_CAT_PATH, TOK_CAT_TITLE, TOK_CAT_AUTHOR, TOK_CAT_DATE, TOK_CAT_FILETIME, TOK_CAT_TAGS, TOK_CAT_LINKS, TOK_CAT_META) } -func (t queryTokenType) isOperation() bool { + +func (t queryTokenType) isDateOperation() bool { return t.Any(TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_LT, TOK_OP_LE, TOK_OP_GE, TOK_OP_GT) } + +func (t queryTokenType) isStringOperation() bool { + return t.Any(TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_RE) +} + func (t queryTokenType) isValue() bool { return t == TOK_VAL_STR || t == TOK_VAL_DATETIME } @@ -233,6 +238,8 @@ func tokenizeOperation(s string) Token { t.Type = TOK_OP_LT case ">": t.Type = TOK_OP_GT + case "!re!": + t.Type = TOK_OP_RE } return t @@ -321,7 +328,7 @@ func TokensStringify(tokens []Token) string { func init() { negPattern := `(?<negation>-?)` categoryPattern := `(?<category>T|p(?:ath)?|a(?:uthor)?|d(?:ate)?|f(?:iletime)?|t(?:ags|itle)?|l(?:inks)?|m(?:eta)?)` - opPattern := `(?<operator>!=|<=|>=|=|:|~|<|>)` + opPattern := `(?<operator>!re!|!=|<=|>=|=|:|~|<|>)` valPattern := `(?<value>".*?"|\S*[^\s\)])` statementPattern := `(?<statement>` + negPattern + categoryPattern + opPattern + valPattern + `)` unknownPattern := `(?<unknown>\S*".*?"[^\s)]*|\S*[^\s\)])` diff --git a/pkg/query/lexer_test.go b/pkg/query/lexer_test.go index 5888902..35bdc4f 100644 --- a/pkg/query/lexer_test.go +++ b/pkg/query/lexer_test.go @@ -22,6 +22,7 @@ const ( TOK_OP_LE = query.TOK_OP_LE TOK_OP_GE = query.TOK_OP_GE TOK_OP_GT = query.TOK_OP_GT + TOK_OP_RE = query.TOK_OP_RE TOK_CAT_TITLE = query.TOK_CAT_TITLE TOK_CAT_AUTHOR = query.TOK_CAT_AUTHOR TOK_CAT_DATE = query.TOK_CAT_DATE @@ -77,7 +78,7 @@ func TestLex(t *testing.T) { {Type: TOK_CLAUSE_END}, {Type: TOK_CLAUSE_END}, }}, - {"nested clauses", "a:a (or t:b t!=c) or d<=01010001 and -T~foo", []Token{ + {"nested clauses", "a:a (or t:b t!=c) or d<=01010001 and -T~foo t!re!bar", []Token{ {Type: TOK_CLAUSE_START}, {TOK_CLAUSE_AND, "and"}, {TOK_CAT_AUTHOR, "a"}, {TOK_OP_AP, ":"}, {TOK_VAL_STR, "a"}, {Type: TOK_CLAUSE_START}, {TOK_CLAUSE_OR, "or"}, @@ -88,6 +89,7 @@ func TestLex(t *testing.T) { {TOK_CAT_DATE, "d"}, {TOK_OP_LE, "<="}, {TOK_VAL_DATETIME, "01010001"}, {Type: TOK_CLAUSE_START}, {TOK_CLAUSE_AND, "and"}, {TOK_OP_NEG, "-"}, {TOK_CAT_TITLE, "T"}, {TOK_OP_AP, "~"}, {TOK_VAL_STR, "foo"}, + {TOK_CAT_TAGS, "t"}, {TOK_OP_RE, "!re!"}, {TOK_VAL_STR, "bar"}, {Type: TOK_CLAUSE_END}, {Type: TOK_CLAUSE_END}, {Type: TOK_CLAUSE_END}, diff --git a/pkg/query/optimizer.go b/pkg/query/optimizer.go index 3c435e8..2cc610a 100644 --- a/pkg/query/optimizer.go +++ b/pkg/query/optimizer.go @@ -1,6 +1,7 @@ package query import ( + "bytes" "slices" "strings" "sync" @@ -52,7 +53,6 @@ func (o Optimizer) Optimize(level int) { if level < 0 { return } else if level == 0 { - // TODO: determine smarter level determination strategy level = o.root.Depth() } @@ -63,6 +63,7 @@ func (o Optimizer) Optimize(level int) { o.StrictEquality() o.Tighten() o.Contradictions() + o.MergeRegex() // parallel + serial o.Tidy() // purely serial @@ -326,6 +327,66 @@ func (o Optimizer) StrictEquality() { }) } +// Merge regular within a clause +func (o *Optimizer) MergeRegex() { + if !o.isSorted { + o.SortStatements() + } + + pool := &sync.Pool{} + pool.New = func() any { + return &bytes.Buffer{} + } + + o.parallel(func(c *Clause) { + if c.Operator != COP_OR { + return + } + + buf := pool.Get().(*bytes.Buffer) + defer pool.Put(buf) + defer buf.Reset() + sortChanged := false + for _, catStmts := range c.Statements.CategoryPartition() { + for op, opStmts := range catStmts.OperatorPartition() { + if op != OP_RE { + continue + } + + for _, stmts := range opStmts.NegatedPartition() { + if len(stmts) <= 1 { + continue + } + sortChanged = true + + for i, stmt := range stmts { + if i == 0 { + buf.WriteByte('(') + buf.WriteString(stmt.Value.(StringValue).S) + buf.WriteByte('|') + } else if i == len(stmts)-1 { + buf.WriteString(stmt.Value.(StringValue).S) + buf.WriteByte(')') + stmts[i] = Statement{} + } else { + buf.WriteString(stmt.Value.(StringValue).S) + buf.WriteByte('|') + stmts[i] = Statement{} + } + } + stmts[0].Value = StringValue{S: buf.String()} + buf.Reset() + } + + break + } + } + if sortChanged { + o.isSorted = false + } + }) +} + // Shrink approximate statements and ranges // // Examples: diff --git a/pkg/query/optimizer_test.go b/pkg/query/optimizer_test.go index 1826c81..bb9c0ba 100644 --- a/pkg/query/optimizer_test.go +++ b/pkg/query/optimizer_test.go @@ -647,3 +647,75 @@ func TestOptimizer_Tighten(t *testing.T) { }) } } + +func TestOptimizer_MergeRegex(t *testing.T) { + tests := []struct { + name string + c *query.Clause + want query.Clause + }{ + { + "only positve", + &query.Clause{ + Operator: query.COP_OR, + Statements: []query.Statement{ + {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"a"}}, + {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"b"}}, + }, + }, + query.Clause{ + Operator: query.COP_OR, + Statements: []query.Statement{ + {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"(a|b)"}}, + }, + }, + }, + { + "multiple categories", + &query.Clause{ + Operator: query.COP_OR, + Statements: []query.Statement{ + {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"a"}}, + {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"b"}}, + {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"c"}}, + {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"d"}}, + }, + }, + query.Clause{ + Operator: query.COP_OR, + Statements: []query.Statement{ + {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"(a|b)"}}, + {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"(c|d)"}}, + }, + }, + }, + { + "mixed negations", + &query.Clause{ + Operator: query.COP_OR, + Statements: []query.Statement{ + {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"a"}}, + {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"b"}}, + {Negated: true, Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"c"}}, + {Negated: true, Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"d"}}, + }, + }, + query.Clause{ + Operator: query.COP_OR, + Statements: []query.Statement{ + {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"(a|b)"}}, + {Negated: true, Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"(c|d)"}}, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + o := query.NewOptimizer(tt.c, WORKERS) + o.MergeRegex() + o.Tidy() + + clauseEqTest(t, tt.c, &tt.want) + }) + } +} diff --git a/pkg/query/parser.go b/pkg/query/parser.go index 0b33a37..ce4ee7b 100644 --- a/pkg/query/parser.go +++ b/pkg/query/parser.go @@ -36,6 +36,7 @@ const ( OP_LE // less than or equal OP_GE // greater than or equal OP_GT // greater than + OP_RE // regular expresion ) type clauseOperator int16 @@ -165,7 +166,7 @@ func (t catType) String() string { } func (t opType) IsFuzzy() bool { - return t == OP_AP || t.IsOrder() + return t == OP_AP || t == OP_RE || t.IsOrder() } func (t opType) IsOrder() bool { @@ -188,6 +189,8 @@ func (t opType) String() string { return "Greater Than or Equal" case OP_GT: return "Greater Than" + case OP_RE: + return "Regular Expression" default: return "Invalid" } @@ -234,6 +237,8 @@ func tokToOp(t queryTokenType) opType { return OP_GE case TOK_OP_GT: return OP_GT + case TOK_OP_RE: + return OP_RE default: return OP_UNKNOWN } @@ -241,7 +246,7 @@ func tokToOp(t queryTokenType) opType { // Apply negation to a statements operator func (s *Statement) Simplify() { - if s.Negated && s.Operator != OP_AP { + if s.Negated && s.Operator != OP_AP && s.Operator != OP_RE { s.Negated = false switch s.Operator { case OP_EQ: @@ -324,23 +329,15 @@ func (s Statements) NegatedPartition() iter.Seq2[bool, Statements] { } return func(yield func(bool, Statements) bool) { - firstNegated := -1 - for i, stmt := range s { - if stmt.Negated { - firstNegated = i - } - } - - if firstNegated > 0 { - if !yield(false, s[:firstNegated]) { - return - } else if !yield(true, s[firstNegated:]) { - return - } - } else { - if !yield(false, s) { - return - } + firstNegated := slices.IndexFunc(s, func(stmt Statement) bool { return stmt.Negated }) + if firstNegated == -1 { + yield(false, s) + return + } else if firstNegated == 0 { + yield(true, s) + return + } else if !yield(false, s[:firstNegated]) { + } else if !yield(true, s[firstNegated:]) { } } } @@ -507,7 +504,7 @@ func Parse(tokens []Token) (*Clause, error) { stmt := Statement{Category: tokToCat(token.Type)} clause.Statements = append(clause.Statements, stmt) } - case TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_LT, TOK_OP_LE, TOK_OP_GE, TOK_OP_GT: + case TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_LT, TOK_OP_LE, TOK_OP_GE, TOK_OP_GT, TOK_OP_RE: if !prevToken.Type.isCategory() { return nil, &TokenError{ got: token, @@ -518,21 +515,21 @@ func Parse(tokens []Token) (*Clause, error) { clause.Statements[len(clause.Statements)-1].Operator = tokToOp(token.Type) case TOK_VAL_STR: - if !prevToken.Type.isOperation() { + if !prevToken.Type.isStringOperation() { return nil, &TokenError{ got: token, gotPrev: prevToken, - wantPrev: "operation", + wantPrev: "string operation", } } clause.Statements[len(clause.Statements)-1].Value = StringValue{token.Value} case TOK_VAL_DATETIME: - if !prevToken.Type.isOperation() { + if !prevToken.Type.isDateOperation() { return nil, &TokenError{ got: token, gotPrev: prevToken, - wantPrev: "operation", + wantPrev: "date operation", } } diff --git a/pkg/query/parser_test.go b/pkg/query/parser_test.go index 5f68006..2e85123 100644 --- a/pkg/query/parser_test.go +++ b/pkg/query/parser_test.go @@ -26,6 +26,7 @@ const ( OP_LE = query.OP_LE OP_GE = query.OP_GE OP_GT = query.OP_GT + OP_RE = query.OP_RE ) func TestParse(t *testing.T) { diff --git a/pkg/shell/interpreter.go b/pkg/shell/interpreter.go index bb0a441..c244c9e 100644 --- a/pkg/shell/interpreter.go +++ b/pkg/shell/interpreter.go @@ -86,6 +86,7 @@ var optimizations = []string{ "contradictions", "compact", "strictEq", + "mergeregex", } var commands = map[string]ITokType{ @@ -459,6 +460,8 @@ out: o.Compact() case "strictEq": o.StrictEquality() + case "mergeregex": + o.MergeRegex() default: suggestion, ok := util.Nearest( optName, @@ -781,6 +784,7 @@ func PrintHelp(w io.Writer) { fmt.Fprintln(w, " contradictions - zero contradicting statements and clauses") fmt.Fprintln(w, " strictEq - zero fuzzy/range statements when an eq is present") fmt.Fprintln(w, " tighten - zero redundant fuzzy/range statements when another mathes the same values") + fmt.Fprintln(w, " mergeregex - merge regexes") fmt.Fprintln(w, "compile (clause) - compile clause into query") fmt.Fprintln(w, "execute (artifact) - excute the compiled query against the connected database") fmt.Fprintln(w, "\nBare commands which return a value assign to an implicit variable _") |
