diff options
| author | JP Appel <jeanpierre.appel01@gmail.com> | 2025-08-10 04:04:41 -0400 |
|---|---|---|
| committer | JP Appel <jeanpierre.appel01@gmail.com> | 2025-08-10 04:04:41 -0400 |
| commit | 92de2b63b6bd0642b92e7ca1c6110bab7f3a2e6b (patch) | |
| tree | e32f3759413e992fa2fcd01c7f254c17d4bf8eed /pkg | |
| parent | 5d0b36cf87ee94c690d11c0beab48f4dadc6fc52 (diff) | |
Change approximate statmenets to use sqlite MATCH operator
Diffstat (limited to 'pkg')
| -rw-r--r-- | pkg/query/compiler.go | 16 | ||||
| -rw-r--r-- | pkg/query/optimizer.go | 72 | ||||
| -rw-r--r-- | pkg/query/parser.go | 6 | ||||
| -rw-r--r-- | pkg/query/parser_test.go | 4 | ||||
| -rw-r--r-- | pkg/shell/interpreter.go | 68 | ||||
| -rw-r--r-- | pkg/util/util.go | 8 |
6 files changed, 121 insertions, 53 deletions
diff --git a/pkg/query/compiler.go b/pkg/query/compiler.go index efdc3b4..dc6f93a 100644 --- a/pkg/query/compiler.go +++ b/pkg/query/compiler.go @@ -73,7 +73,7 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error if cat.IsOrdered() { opStr = "BETWEEN " } else { - opStr = "LIKE " + opStr = "MATCH " } case OP_EQ: if cat.IsSet() { @@ -82,16 +82,12 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error opStr = "= " } case OP_GE: - // NOTE: doesn't raise compiler error if operator used on invalid category opStr = ">= " case OP_GT: - // NOTE: doesn't raise compiler error if operator used on invalid category opStr = "> " case OP_LE: - // NOTE: doesn't raise compiler error if operator used on invalid category opStr = "<= " case OP_LT: - // NOTE: doesn't raise compiler error if operator used on invalid category opStr = "< " case OP_RE: opStr = "REGEXP " @@ -156,10 +152,10 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error b.WriteString(opStr) arg, ok := stmt.Value.buildCompile(b) if ok { - args = append(args, "%"+arg+"%") + args = append(args, arg) } if idx != len(opStmts)-1 { - b.WriteString(" OR ") + b.WriteString(" " + delim + " ") } sCount++ idx++ @@ -200,11 +196,7 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error b.WriteString(opStr) arg, ok := stmt.Value.buildCompile(b) if ok { - if op == OP_AP { - args = append(args, "%"+arg+"%") - } else { - args = append(args, arg) - } + args = append(args, arg) } b.WriteByte(' ') if idx != len(opStmts)-1 { diff --git a/pkg/query/optimizer.go b/pkg/query/optimizer.go index 2cc610a..337ee35 100644 --- a/pkg/query/optimizer.go +++ b/pkg/query/optimizer.go @@ -9,6 +9,9 @@ import ( "github.com/jpappel/atlas/pkg/util" ) +// FIXME: any substring checks on unorderd approximate statements will fail +// this is because quotes are added to all approximate string values + type Optimizer struct { workers uint root *Clause @@ -64,6 +67,7 @@ func (o Optimizer) Optimize(level int) { o.Tighten() o.Contradictions() o.MergeRegex() + o.MergeApproximateMatches() // parallel + serial o.Tidy() // purely serial @@ -175,6 +179,7 @@ func (o *Optimizer) Compact() { o.isSorted = false } +// Remove noop statements and clauses func (o *Optimizer) Tidy() { // ensure ordering if !o.isSorted { @@ -306,7 +311,7 @@ func (o Optimizer) StrictEquality() { stricts = append(stricts, val) case OP_AP: if slices.ContainsFunc(stricts, func(strictStr string) bool { - return strings.Contains(strictStr, val) || strings.Contains(val, strictStr) + return util.ContainsSliced(strictStr, val, 1, len(val)-1) || util.ContainsSliced(val, strictStr, 1, len(strictStr)-1) }) { stmts[i] = Statement{} o.isSorted = false @@ -327,7 +332,7 @@ func (o Optimizer) StrictEquality() { }) } -// Merge regular within a clause +// Merge regular expressions within a clause func (o *Optimizer) MergeRegex() { if !o.isSorted { o.SortStatements() @@ -354,7 +359,7 @@ func (o *Optimizer) MergeRegex() { } for _, stmts := range opStmts.NegatedPartition() { - if len(stmts) <= 1 { + if len(stmts) < 2 { continue } sortChanged = true @@ -387,6 +392,59 @@ func (o *Optimizer) MergeRegex() { }) } +func (o *Optimizer) MergeApproximateMatches() { + if !o.isSorted { + o.SortStatements() + } + + pool := &sync.Pool{} + pool.New = func() any { + return &strings.Builder{} + } + + o.parallel(func(c *Clause) { + var delim string + switch c.Operator { + case COP_AND: + delim = " AND " + case COP_OR: + delim = " OR " + } + + b := pool.Get().(*strings.Builder) + defer pool.Put(b) + defer b.Reset() + + changeSort := false + for category, catStmts := range c.Statements.CategoryPartition() { + if len(catStmts) < 2 || category.IsOrdered() { + continue + } + for op, opStmts := range catStmts.OperatorPartition() { + if op != OP_AP || len(opStmts) < 2 { + continue + } + changeSort = true + for i, stmt := range opStmts { + b.WriteString(stmt.Value.(StringValue).S) + if i != len(opStmts)-1 { + b.WriteString(delim) + } + + if i != 0 { + opStmts[i] = Statement{} + } + } + opStmts[0].Value = StringValue{S: b.String()} + b.Reset() + } + } + if changeSort { + o.isSorted = false + } + }) +} + // Shrink approximate statements and ranges // // Examples: @@ -468,9 +526,9 @@ func (o *Optimizer) Tighten() { val1 := strings.ToLower(s1.Value.(StringValue).S) for j, s2 := range util.FilterIter(stmts[i+1:], func(s Statement) bool { return s.Operator == OP_AP }) { val2 := strings.ToLower(s2.Value.(StringValue).S) - if strings.Contains(val2, val1) { + if util.ContainsSliced(val2, val1, 1, len(val1)-1) { removals[i] = true - } else if strings.Contains(val1, val2) { + } else if util.ContainsSliced(val1, val2, 1, len(val2)-1) { removals[j] = true } } @@ -516,10 +574,10 @@ func (o *Optimizer) Tighten() { val1 := strings.ToLower(s1.Value.(StringValue).S) for j, s2 := range util.FilterIter(stmts[i+1:], func(s Statement) bool { return s.Operator == OP_AP }) { val2 := strings.ToLower(s2.Value.(StringValue).S) - if strings.Contains(val2, val1) { + if util.ContainsSliced(val2, val1, 1, len(val1)-1) { // NOTE: slicing stmts offsets the all indices by 1, hence the correction removals[j+1] = true - } else if strings.Contains(val1, val2) { + } else if util.ContainsSliced(val1, val2, 1, len(val2)-1) { removals[i] = true } } diff --git a/pkg/query/parser.go b/pkg/query/parser.go index 3406bc6..019deac 100644 --- a/pkg/query/parser.go +++ b/pkg/query/parser.go @@ -527,7 +527,11 @@ func Parse(tokens []Token) (*Clause, error) { } } - clause.Statements[len(clause.Statements)-1].Value = StringValue{token.Value} + if prevToken.Type == TOK_OP_AP { + clause.Statements[len(clause.Statements)-1].Value = StringValue{"\"" + token.Value + "\""} + } else { + clause.Statements[len(clause.Statements)-1].Value = StringValue{token.Value} + } case TOK_VAL_DATETIME: if !prevToken.Type.isDateOperation() { return nil, &TokenError{ diff --git a/pkg/query/parser_test.go b/pkg/query/parser_test.go index 5c837c0..b1a98f1 100644 --- a/pkg/query/parser_test.go +++ b/pkg/query/parser_test.go @@ -53,7 +53,7 @@ func TestParse(t *testing.T) { &query.Clause{ Operator: query.COP_AND, Statements: []query.Statement{ - {Category: CAT_AUTHOR, Operator: OP_AP, Value: query.StringValue{"ken thompson"}}, + {Category: CAT_AUTHOR, Operator: OP_AP, Value: query.StringValue{"\"ken thompson\""}}, }, }, nil, @@ -70,7 +70,7 @@ func TestParse(t *testing.T) { &query.Clause{ Operator: query.COP_AND, Statements: []query.Statement{ - {Category: CAT_AUTHOR, Operator: OP_AP, Value: query.StringValue{"Alonzo Church"}}, + {Category: CAT_AUTHOR, Operator: OP_AP, Value: query.StringValue{"\"Alonzo Church\""}}, }, Clauses: []*query.Clause{ { diff --git a/pkg/shell/interpreter.go b/pkg/shell/interpreter.go index 287c28f..4c6bb03 100644 --- a/pkg/shell/interpreter.go +++ b/pkg/shell/interpreter.go @@ -88,6 +88,7 @@ var optimizations = []string{ "compact", "strictEq", "mergeregex", + "mergeap", } var commands = map[string]ITokType{ @@ -444,37 +445,41 @@ out: } o := query.NewOptimizer(clause, inter.Workers) - switch optName { - case "simplify": - o.Simplify() - case "tighten": - o.Tighten() - case "flatten": - o.Flatten() - case "sort": - o.SortStatements() - case "tidy": - o.Tidy() - case "contradictions": - o.Contradictions() - case "compact": - o.Compact() - case "strictEq": - o.StrictEquality() - case "mergeregex": - o.MergeRegex() - default: - suggestion, ok := util.Nearest( - optName, - inter.keywords.optimizations, - util.LevensteinDistance, - min(len(optName), 4), - ) - suggestionTxt := "" - if ok { - suggestionTxt = fmt.Sprintf(": Did you mean '%s'?", suggestion) + for curOpt := range strings.SplitSeq(optName, ",") { + switch curOpt { + case "simplify": + o.Simplify() + case "tighten": + o.Tighten() + case "flatten": + o.Flatten() + case "sort": + o.SortStatements() + case "tidy": + o.Tidy() + case "contradictions": + o.Contradictions() + case "compact": + o.Compact() + case "strictEq": + o.StrictEquality() + case "mergeregex": + o.MergeRegex() + case "mergeap": + o.MergeApproximateMatches() + default: + suggestion, ok := util.Nearest( + optName, + inter.keywords.optimizations, + util.LevensteinDistance, + min(len(optName), 4), + ) + suggestionTxt := "" + if ok { + suggestionTxt = fmt.Sprintf(": Did you mean '%s'?", suggestion) + } + return false, fmt.Errorf("Unrecognized optimization %s%s", t.Text, suggestionTxt) } - return false, fmt.Errorf("Unrecognized optimization %s%s", t.Text, suggestionTxt) } stack = append(stack, Value{VAL_CLAUSE, clause}) @@ -777,7 +782,7 @@ func PrintHelp(w io.Writer) { fmt.Fprintln(w, " ex. tokenize `author:me") fmt.Fprintln(w, "parse (tokens) - parse tokens into a clause") fmt.Fprintln(w, "optimize <level> (clause) - optimize clause tree to <level>") - fmt.Fprintln(w, "opt <subcommand> (clause) - apply specific optimization to clause tree") + fmt.Fprintln(w, "opt <subcommand1>,... (clause) - apply specific optimization(s) to clause tree") fmt.Fprintln(w, " sort - sort statements") fmt.Fprintln(w, " flatten - flatten clauses") fmt.Fprintln(w, " compact - compact equivalent statements") @@ -786,6 +791,7 @@ func PrintHelp(w io.Writer) { fmt.Fprintln(w, " strictEq - zero fuzzy/range statements when an eq is present") fmt.Fprintln(w, " tighten - zero redundant fuzzy/range statements when another mathes the same values") fmt.Fprintln(w, " mergeregex - merge regexes") + fmt.Fprintln(w, " mergeap - merge unordered approximate statements") fmt.Fprintln(w, "compile (clause) - compile clause into query") fmt.Fprintln(w, "execute (artifact) - excute the compiled query against the connected database") fmt.Fprintln(w, "\nBare commands which return a value assign to an implicit variable _") diff --git a/pkg/util/util.go b/pkg/util/util.go index b26b46a..7963cac 100644 --- a/pkg/util/util.go +++ b/pkg/util/util.go @@ -3,6 +3,7 @@ package util import ( "iter" "math" + "strings" "time" ) @@ -162,3 +163,10 @@ func Nearest[E any](candidate E, valid []E, cmp func(E, E) int, ceil int) (E, bo } return valid[minIdx], minDistance < ceil } + +// Check if substr[left:right] is a substring of S. +// If left > len(substr) use 0 +// If right < 0 use 0 +func ContainsSliced(s, substr string, left, right int) bool { + return strings.Contains(s, substr[min(left, len(substr)):max(right, 0)]) +} |
