diff options
Diffstat (limited to 'pkg/query')
| -rw-r--r-- | pkg/query/lexer.go | 2 | ||||
| -rw-r--r-- | pkg/query/optimizer.go | 434 | ||||
| -rw-r--r-- | pkg/query/optimizer_test.go | 11 | ||||
| -rw-r--r-- | pkg/query/parser.go | 19 |
4 files changed, 423 insertions, 43 deletions
diff --git a/pkg/query/lexer.go b/pkg/query/lexer.go index 4d875fb..759b5bc 100644 --- a/pkg/query/lexer.go +++ b/pkg/query/lexer.go @@ -340,6 +340,6 @@ func init() { LexRegexPattern = clausePattern // FIXME: fails to match start of clauses with no values - // ex: (and (or ... )) fails + // example: (and (or ... )) fails LexRegex = regexp.MustCompile(LexRegexPattern) } diff --git a/pkg/query/optimizer.go b/pkg/query/optimizer.go index 46c2739..8b99a64 100644 --- a/pkg/query/optimizer.go +++ b/pkg/query/optimizer.go @@ -1,10 +1,19 @@ package query import ( + "iter" "slices" + "strings" + "sync" + + "github.com/jpappel/atlas/pkg/util" ) -type Optimizer struct{} +type Optimizer struct { + workers uint + root *Clause + isSorted bool // current sort state of statement for all clauses +} func StatementCmp(a Statement, b Statement) int { catDiff := int(a.Category - b.Category) @@ -16,7 +25,12 @@ func StatementCmp(a Statement, b Statement) int { negatedDiff = -1 } - return catDiff*100_000 + opDiff*100 + negatedDiff*10 + a.Value.Compare(b.Value) + var valDiff int + if a.Value != nil && b.Value != nil { + valDiff = a.Value.Compare(b.Value) + } + + return catDiff*100_000 + opDiff*100 + negatedDiff*10 + valDiff } func StatementEq(a Statement, b Statement) bool { @@ -25,17 +39,97 @@ func StatementEq(a Statement, b Statement) bool { return a.Category == b.Category && a.Operator == b.Operator && a.Negated == b.Negated && a.Value.Compare(b.Value) == 0 } -// Merge child clauses with their parents when applicable -func (o Optimizer) Flatten(root *Clause) { - stack := make([]*Clause, 0, len(root.Clauses)) - stack = append(stack, root) +func NewOptimizer(root *Clause, workers uint) Optimizer { + return Optimizer{ + root: root, + workers: workers, + } +} + +// Perform optimizations in parallel. They should **NOT** mutate the tree +func (o Optimizer) parallel(optimize func(*Clause)) { + jobs := make(chan *Clause, o.workers) + + wg := &sync.WaitGroup{} + wg.Add(int(o.workers)) + + for range o.workers { + go func(jobs <-chan *Clause, wg *sync.WaitGroup) { + for clause := range jobs { + optimize(clause) + } + wg.Done() + }(jobs, wg) + } + + for clause := range o.root.DFS() { + jobs <- clause + } + close(jobs) + wg.Wait() +} + +// Perform Optimizations serially. Only use this if the tree is being modified. +// When modifying a clause set children that should not be explored to nil +func (o *Optimizer) serial(optimize func(*Clause)) { + stack := make([]*Clause, 0, len(o.root.Clauses)) + stack = append(stack, o.root) for len(stack) != 0 { top := len(stack) - 1 node := stack[top] stack = stack[:top] - hasMerged := false + optimize(node) + node.Clauses = slices.DeleteFunc(node.Clauses, func(child *Clause) bool { + return child == nil + }) + stack = append(stack, node.Clauses...) + } +} + +// Partition statements by their category without copying (slices clause.Statements) +func (o *Optimizer) partitionStatemements(clause *Clause) iter.Seq2[catType, Statements] { + return func(yield func(catType, Statements) bool) { + var category, lastCategory catType + var lastCategoryStart int + for i, stmt := range clause.Statements { + category = stmt.Category + if category != lastCategory { + if !yield(lastCategory, clause.Statements[lastCategoryStart:i]) { + return + } + lastCategoryStart = i + } + lastCategory = category + } + + // handle leftover + if !yield(category, clause.Statements[lastCategoryStart:]) { + return + } + } +} + +func (o *Optimizer) SortStatements() { + o.parallel(func(c *Clause) { + slices.SortFunc(c.Statements, StatementCmp) + }) + o.isSorted = true +} + +// Simplify all statements +func (o *Optimizer) Simplify() { + o.parallel(func(c *Clause) { + for i := range c.Statements { + (&c.Statements[i]).Simplify() + } + }) +} + +// Merge child clauses with their parents when applicable +func (o *Optimizer) Flatten() { + o.serial(func(node *Clause) { // merge if only child clause if len(node.Statements) == 0 && len(node.Clauses) == 1 { child := node.Clauses[0] @@ -48,53 +142,319 @@ func (o Optimizer) Flatten(root *Clause) { // cannot be "modernized", node.Clauses is modified in loop for i := 0; i < len(node.Clauses); i++ { child := node.Clauses[i] - - // merge because of commutativity - if node.Operator == child.Operator { - hasMerged = true + isSingleStmt := len(child.Clauses) == 0 && len(child.Statements) == 1 + // merge because of commutativity or leaf node with single statement + if node.Operator == child.Operator || isSingleStmt { node.Statements = append(node.Statements, child.Statements...) node.Clauses = append(node.Clauses, child.Clauses...) - } else { - stack = append(stack, child) + node.Clauses[i] = nil + } + } + }) +} + +func (o *Optimizer) Compact() { + o.parallel(func(c *Clause) { + c.Statements = slices.CompactFunc(c.Statements, StatementEq) + }) + o.isSorted = false +} + +func (o *Optimizer) Tidy() { + // ensure ordering + if !o.isSorted { + o.SortStatements() + } + + marked := make(map[*Clause]bool, 0) + markedLock := &sync.Mutex{} + // slice away noops + o.parallel(func(c *Clause) { + // PERF: should be benchmarked against binary seach, likely no performance gain + // for typical length of Statements + start := slices.IndexFunc(c.Statements, func(s Statement) bool { + // NOTE: this breaks if valid categories exist between + // CAT_UNKNOWN + CAT_TITLE or after CAT_META + return s.Category > CAT_UNKNOWN && s.Category <= CAT_META + }) + + // this means no valid categories in statements + if start == -1 { + c.Statements = nil + markedLock.Lock() + marked[c] = true + markedLock.Unlock() + return + } + + stop := len(c.Statements) + for i := stop; i > 0; i-- { + // NOTE: this breaks if valid categories exist after CAT_META + if c.Statements[i-1].Category <= CAT_META { + stop = i + break } } + c.Statements = c.Statements[start:stop] + }) + + o.serial(func(c *Clause) { + for i, child := range c.Clauses { + if !marked[child] { + continue + } - if hasMerged { - numChildren := len(stack) - top - if numChildren > 0 { - node.Clauses = slices.Grow(node.Clauses, numChildren) - node.Clauses = node.Clauses[:numChildren] - copy(node.Clauses, stack[top:top+numChildren]) + if c.Operator == COP_AND { + c.Statements = nil + c.Clauses = nil + break } else { - node.Clauses = nil + c.Clauses[i] = nil } } - } + }) } -func (o Optimizer) Compact(c *Clause) { - for clause := range c.DFS() { - clause.Statements = slices.CompactFunc(clause.Statements, StatementEq) - } +func inverseEq(s1, s2 Statement) bool { + s1.Negated = true + return StatementEq(s1, s2) } -// if any claus is a strict equality/inequality noop all fuzzy operations -func strictEquality(clause Clause) error { - isStrict := slices.ContainsFunc(clause.Statements, func(stmt Statement) bool { - if stmt.Operator == OP_EQ || stmt.Operator == OP_NE { - return true +// Replace contradictions with noops +func (o *Optimizer) Contradictions() { + if !o.isSorted { + o.SortStatements() + } + + o.parallel(func(c *Clause) { + removals := make(map[int]bool, 8) + var isContradiction func(s1, s2 Statement) bool + for category, stmts := range o.partitionStatemements(c) { + if c.Operator == COP_AND && !category.IsSet() { + isContradiction = func(s1, s2 Statement) bool { + return (s1.Operator == OP_EQ && s1.Operator == s2.Operator) || inverseEq(s1, s2) + } + } else { + isContradiction = inverseEq + } + clear(removals) + for i := range stmts { + a := stmts[i] + a.Negated = !a.Negated + for j := i + 1; j < len(stmts); j++ { + b := stmts[j] + if isContradiction(a, b) { + removals[i] = true + removals[j] = true + } + } + } + + for idx := range removals { + stmts[idx] = Statement{} + } + if len(removals) > 0 { + o.isSorted = false + } } - return false }) +} + +// Remove fuzzy/range based statements when possible. +// Does not remove contradictions. +// +// Examples: +// +// (and d="May 1, 1886" d>="January 1, 1880") --> (and d="May 1, 1886") +// (and T=notes T:"monday standup") --> (and T=notes) +// (and T="Meeting Notes" T:notes) --> (and T="Meeting Notes") +// (and a="Alonzo Church" a="Alan Turing" a:turing) --> (and a="Alonzo Church" a="Alan Turing") +// (and a="Alonzo Church" a="Alan Turing" a:djikstra) --> (and a="Alonzo Church" a="Alan Turing" a:djikstra) +// (and T=foo T=bar T:foobar) --> (and T=foo T=bar) +func (o Optimizer) StrictEquality() { + if !o.isSorted { + o.SortStatements() + } + o.parallel(func(c *Clause) { + if c.Operator != COP_AND { + return + } - if isStrict { - for i := range clause.Statements { - stmt := clause.Statements[i] - if stmt.Operator != OP_EQ && stmt.Operator != OP_NE { - clause.Statements[i] = Statement{} + stricts := make([]string, 0) + for category, stmts := range o.partitionStatemements(c) { + if category.IsSet() { + clear(stricts) + for i, s := range stmts { + val := strings.ToLower(s.Value.(StringValue).S) + if s.Operator == OP_EQ { + stricts = append(stricts, val) + } else if s.Operator == OP_AP { + if slices.ContainsFunc(stricts, func(strictStr string) bool { + return strings.Contains(strictStr, val) || strings.Contains(val, strictStr) + }) { + stmts[i] = Statement{} + o.isSorted = false + } + } + } + } else { + hasEq := false + for i, s := range stmts { + hasEq = hasEq || (s.Operator == OP_EQ) + if hasEq && s.Operator != OP_EQ { + stmts[i] = Statement{} + o.isSorted = false + } + } } } + }) +} + +// Shrink approximate statements and ranges +// +// Examples: +// +// (or d>"2025-01-01 d>"2025-02-02") --> (or d>"2025-01-01") +// (and d>"2025-01-01 d>"2025-02-02") --> (and d>"2025-02-02") +// (or T:"Das Kapital I" T:"Das Kapital") --> (and T:"Das Kapital") +// (and T:"Das Kapital I" T:"Das Kapital") --> (and T:"Das Kapital I") +func (o *Optimizer) Tighten() { + if !o.isSorted { + o.SortStatements() } - return nil + o.parallel(func(c *Clause) { + for category, stmts := range o.partitionStatemements(c) { + if len(stmts) < 2 { + continue + } + if c.Operator == COP_AND { + if category.IsOrdered() { + minLT, minLE := -1, -1 + maxGT, maxGE := -1, -1 + for i, s := range stmts { + if s.Operator == OP_LT && minLT == -1 { + minLT = i + } else if s.Operator == OP_LE && minLE == -1 { + minLE = i + } else if s.Operator == OP_GE { + maxGE = i + } else if s.Operator == OP_GT { + maxGT = i + } + } + + lowerIdx, upperIdx := -1, -1 + if minLT != -1 && minLE != -1 { + ltStmt := stmts[minLT] + leStmt := stmts[minLE] + leDate := leStmt.Value.(DatetimeValue).D + ltDate := ltStmt.Value.(DatetimeValue).D + + if ltDate.After(leDate) { + upperIdx = minLE + } else { + upperIdx = minLT + } + } else if minLT != -1 { + upperIdx = minLT + } else if minLE != -1 { + upperIdx = minLE + } + if maxGT != -1 && maxGE != -1 { + gtStmt := stmts[maxGT] + geStmt := stmts[maxGE] + geDate := geStmt.Value.(DatetimeValue).D + gtDate := gtStmt.Value.(DatetimeValue).D + + if geDate.After(gtDate) { + lowerIdx = maxGE + } else { + lowerIdx = maxGT + } + } else if maxGT != -1 { + lowerIdx = maxGT + } else if maxGE != -1 { + lowerIdx = maxGE + } + + for i, s := range stmts { + if !s.Operator.IsOrder() || i == lowerIdx || i == upperIdx { + continue + } + + stmts[i] = Statement{} + } + } else { + removals := make(map[int]bool) + for i, s1 := range util.FilterIter(stmts, func(s Statement) bool { return s.Operator == OP_AP }) { + val1 := strings.ToLower(s1.Value.(StringValue).S) + for j, s2 := range util.FilterIter(stmts[i+1:], func(s Statement) bool { return s.Operator == OP_AP }) { + val2 := strings.ToLower(s2.Value.(StringValue).S) + if strings.Contains(val2, val1) { + removals[i] = true + } else if strings.Contains(val1, val2) { + removals[j] = true + } + } + } + for idx := range removals { + stmts[idx] = Statement{} + } + if len(removals) > 0 { + o.isSorted = false + } + } + } else { + if category.IsOrdered() { + // NOTE: doesn't handle fuzzy dates + minIdx := slices.IndexFunc(stmts, func(s Statement) bool { + return s.Operator.IsOrder() + }) + maxIdx := len(stmts) - 1 + for i, s := range slices.Backward(stmts) { + if s.Operator.IsOrder() { + maxIdx = i + break + } + } + if minIdx != -1 { + start, stop := minIdx, maxIdx + if minS := stmts[minIdx]; minS.Operator == OP_GE || minS.Operator == OP_GT { + start++ + } + if maxS := stmts[maxIdx]; maxS.Operator == OP_LT || maxS.Operator == OP_LE { + stop-- + } + for i := start; i <= stop; i++ { + stmts[i] = Statement{} + } + } + } else { + // NOTE: this has to be all pairs for correctness, + // but it doesn't have to be this sloppy...... :| + removals := make(map[int]bool) + for i, s1 := range util.FilterIter(stmts, func(s Statement) bool { return s.Operator == OP_AP }) { + val1 := strings.ToLower(s1.Value.(StringValue).S) + for j, s2 := range util.FilterIter(stmts[i+1:], func(s Statement) bool { return s.Operator == OP_AP }) { + val2 := strings.ToLower(s2.Value.(StringValue).S) + if strings.Contains(val2, val1) { + removals[j] = true + } else if strings.Contains(val1, val2) { + removals[i] = true + } + } + } + + for idx := range removals { + stmts[idx] = Statement{} + } + if len(removals) > 0 { + o.isSorted = false + } + } + } + } + }) } diff --git a/pkg/query/optimizer_test.go b/pkg/query/optimizer_test.go index 4de3a12..00710f6 100644 --- a/pkg/query/optimizer_test.go +++ b/pkg/query/optimizer_test.go @@ -1,6 +1,7 @@ package query_test import ( + "runtime" "slices" "testing" @@ -163,9 +164,10 @@ func TestClause_Flatten(t *testing.T) { }, } for _, tt := range tests { - o := query.Optimizer{} + workers := uint(runtime.NumCPU()) t.Run(tt.name, func(t *testing.T) { - o.Flatten(tt.root) + o := query.NewOptimizer(tt.root, workers) + o.Flatten() slices.SortFunc(tt.root.Statements, query.StatementCmp) slices.SortFunc(tt.expected.Statements, query.StatementCmp) @@ -256,9 +258,10 @@ func TestOptimizer_Compact(t *testing.T) { }, } for _, tt := range tests { - o := query.Optimizer{} + workers := uint(runtime.NumCPU()) t.Run(tt.name, func(t *testing.T) { - o.Compact(tt.c) + o := query.NewOptimizer(tt.c, workers) + o.Compact() got := slices.Collect(tt.c.DFS()) want := slices.Collect(tt.want.DFS()) diff --git a/pkg/query/parser.go b/pkg/query/parser.go index 14cc227..2bfc014 100644 --- a/pkg/query/parser.go +++ b/pkg/query/parser.go @@ -28,8 +28,8 @@ type opType int const ( OP_UNKNOWN opType = iota OP_EQ // equal - OP_AP // approximate/fuzzy OP_NE // not equal + OP_AP // approximate/fuzzy OP_LT // less than OP_LE // less than or equal OP_GE // greater than or equal @@ -122,6 +122,15 @@ func (v DatetimeValue) Compare(other Valuer) int { var _ Valuer = StringValue{} var _ Valuer = DatetimeValue{} +// Return if OP_EQ behaves like set membership +func (t catType) IsSet() bool { + return t == CAT_TAGS || t == CAT_AUTHOR || t == CAT_LINKS +} + +func (t catType) IsOrdered() bool { + return t == CAT_DATE || t == CAT_FILETIME +} + func (t catType) String() string { switch t { case CAT_TITLE: @@ -143,6 +152,14 @@ func (t catType) String() string { } } +func (t opType) IsFuzzy() bool { + return t == OP_AP || t.IsOrder() +} + +func (t opType) IsOrder() bool { + return t == OP_LT || t == OP_LE || t == OP_GT || t == OP_GE +} + func (t opType) String() string { switch t { case OP_EQ: |
