From 92de2b63b6bd0642b92e7ca1c6110bab7f3a2e6b Mon Sep 17 00:00:00 2001 From: JP Appel Date: Sun, 10 Aug 2025 04:04:41 -0400 Subject: Change approximate statements to use sqlite MATCH operator --- pkg/query/compiler.go | 16 +++-------- pkg/query/optimizer.go | 72 +++++++++++++++++++++++++++++++++++++++++++----- pkg/query/parser.go | 6 +++- pkg/query/parser_test.go | 4 +-- 4 files changed, 76 insertions(+), 22 deletions(-) (limited to 'pkg/query') diff --git a/pkg/query/compiler.go b/pkg/query/compiler.go index efdc3b4..dc6f93a 100644 --- a/pkg/query/compiler.go +++ b/pkg/query/compiler.go @@ -73,7 +73,7 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error if cat.IsOrdered() { opStr = "BETWEEN " } else { - opStr = "LIKE " + opStr = "MATCH " } case OP_EQ: if cat.IsSet() { @@ -82,16 +82,12 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error opStr = "= " } case OP_GE: - // NOTE: doesn't raise compiler error if operator used on invalid category opStr = ">= " case OP_GT: - // NOTE: doesn't raise compiler error if operator used on invalid category opStr = "> " case OP_LE: - // NOTE: doesn't raise compiler error if operator used on invalid category opStr = "<= " case OP_LT: - // NOTE: doesn't raise compiler error if operator used on invalid category opStr = "< " case OP_RE: opStr = "REGEXP " @@ -156,10 +152,10 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error b.WriteString(opStr) arg, ok := stmt.Value.buildCompile(b) if ok { - args = append(args, "%"+arg+"%") + args = append(args, arg) } if idx != len(opStmts)-1 { - b.WriteString(" OR ") + b.WriteString(" " + delim + " ") } sCount++ idx++ @@ -200,11 +196,7 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error b.WriteString(opStr) arg, ok := stmt.Value.buildCompile(b) if ok { - if op == OP_AP { - args = append(args, "%"+arg+"%") - } else { - args = append(args, arg) - } + args = append(args, arg) } 
b.WriteByte(' ') if idx != len(opStmts)-1 { diff --git a/pkg/query/optimizer.go b/pkg/query/optimizer.go index 2cc610a..337ee35 100644 --- a/pkg/query/optimizer.go +++ b/pkg/query/optimizer.go @@ -9,6 +9,9 @@ import ( "github.com/jpappel/atlas/pkg/util" ) +// FIXME: any substring checks on unordered approximate statements will fail +// this is because quotes are added to all approximate string values + type Optimizer struct { workers uint root *Clause @@ -64,6 +67,7 @@ func (o Optimizer) Optimize(level int) { o.Tighten() o.Contradictions() o.MergeRegex() + o.MergeApproximateMatches() // parallel + serial o.Tidy() // purely serial @@ -175,6 +179,7 @@ func (o *Optimizer) Compact() { o.isSorted = false } +// Remove noop statements and clauses func (o *Optimizer) Tidy() { // ensure ordering if !o.isSorted { @@ -306,7 +311,7 @@ func (o Optimizer) StrictEquality() { stricts = append(stricts, val) case OP_AP: if slices.ContainsFunc(stricts, func(strictStr string) bool { - return strings.Contains(strictStr, val) || strings.Contains(val, strictStr) + return util.ContainsSliced(strictStr, val, 1, len(val)-1) || util.ContainsSliced(val, strictStr, 1, len(strictStr)-1) }) { stmts[i] = Statement{} o.isSorted = false @@ -327,7 +332,7 @@ func (o Optimizer) StrictEquality() { }) } -// Merge regular within a clause +// Merge regular expressions within a clause func (o *Optimizer) MergeRegex() { if !o.isSorted { o.SortStatements() @@ -354,7 +359,7 @@ func (o *Optimizer) MergeRegex() { } for _, stmts := range opStmts.NegatedPartition() { - if len(stmts) <= 1 { + if len(stmts) < 2 { continue } sortChanged = true @@ -387,6 +392,59 @@ func (o *Optimizer) MergeRegex() { }) } +func (o *Optimizer) MergeApproximateMatches() { + if !o.isSorted { + o.SortStatements() + } + + pool := &sync.Pool{} + pool.New = func() any { + return &strings.Builder{} + } + + o.parallel(func(c *Clause) { + var delim string + switch c.Operator { + case COP_AND: + delim = " AND " + case COP_OR: + delim = " OR " + } 
+ + b := pool.Get().(*strings.Builder) + defer pool.Put(b) + defer b.Reset() + + changeSort := false + for category, catStmts := range c.Statements.CategoryPartition() { + if len(catStmts) < 2 || category.IsOrdered() { + continue + } + for op, opStmts := range catStmts.OperatorPartition() { + if op != OP_AP || len(opStmts) < 2 { + continue + } + changeSort = true + for i, stmt := range opStmts { + b.WriteString(stmt.Value.(StringValue).S) + if i != len(opStmts)-1 { + b.WriteString(delim) + } + + if i != 0 { + opStmts[i] = Statement{} + } + } + opStmts[0].Value = StringValue{S: b.String()} + b.Reset() + } + } + if changeSort { + o.isSorted = false + } + }) +} + // Shrink approximate statements and ranges // // Examples: @@ -468,9 +526,9 @@ func (o *Optimizer) Tighten() { val1 := strings.ToLower(s1.Value.(StringValue).S) for j, s2 := range util.FilterIter(stmts[i+1:], func(s Statement) bool { return s.Operator == OP_AP }) { val2 := strings.ToLower(s2.Value.(StringValue).S) - if strings.Contains(val2, val1) { + if util.ContainsSliced(val2, val1, 1, len(val1)-1) { removals[i] = true - } else if strings.Contains(val1, val2) { + } else if util.ContainsSliced(val1, val2, 1, len(val2)-1) { removals[j] = true } } @@ -516,10 +574,10 @@ func (o *Optimizer) Tighten() { val1 := strings.ToLower(s1.Value.(StringValue).S) for j, s2 := range util.FilterIter(stmts[i+1:], func(s Statement) bool { return s.Operator == OP_AP }) { val2 := strings.ToLower(s2.Value.(StringValue).S) - if strings.Contains(val2, val1) { + if util.ContainsSliced(val2, val1, 1, len(val1)-1) { // NOTE: slicing stmts offsets the all indices by 1, hence the correction removals[j+1] = true - } else if strings.Contains(val1, val2) { + } else if util.ContainsSliced(val1, val2, 1, len(val2)-1) { removals[i] = true } } diff --git a/pkg/query/parser.go b/pkg/query/parser.go index 3406bc6..019deac 100644 --- a/pkg/query/parser.go +++ b/pkg/query/parser.go @@ -527,7 +527,11 @@ func Parse(tokens []Token) (*Clause, error) 
{ } } - clause.Statements[len(clause.Statements)-1].Value = StringValue{token.Value} + if prevToken.Type == TOK_OP_AP { + clause.Statements[len(clause.Statements)-1].Value = StringValue{"\"" + token.Value + "\""} + } else { + clause.Statements[len(clause.Statements)-1].Value = StringValue{token.Value} + } case TOK_VAL_DATETIME: if !prevToken.Type.isDateOperation() { return nil, &TokenError{ diff --git a/pkg/query/parser_test.go b/pkg/query/parser_test.go index 5c837c0..b1a98f1 100644 --- a/pkg/query/parser_test.go +++ b/pkg/query/parser_test.go @@ -53,7 +53,7 @@ func TestParse(t *testing.T) { &query.Clause{ Operator: query.COP_AND, Statements: []query.Statement{ - {Category: CAT_AUTHOR, Operator: OP_AP, Value: query.StringValue{"ken thompson"}}, + {Category: CAT_AUTHOR, Operator: OP_AP, Value: query.StringValue{"\"ken thompson\""}}, }, }, nil, @@ -70,7 +70,7 @@ func TestParse(t *testing.T) { &query.Clause{ Operator: query.COP_AND, Statements: []query.Statement{ - {Category: CAT_AUTHOR, Operator: OP_AP, Value: query.StringValue{"Alonzo Church"}}, + {Category: CAT_AUTHOR, Operator: OP_AP, Value: query.StringValue{"\"Alonzo Church\""}}, }, Clauses: []*query.Clause{ { -- cgit v1.2.3