4 files changed, 423 insertions, 43 deletions
diff --git a/pkg/query/lexer.go b/pkg/query/lexer.go
index 4d875fb..759b5bc 100644
--- a/pkg/query/lexer.go
+++ b/pkg/query/lexer.go
@@ -340,6 +340,6 @@ func init() {
 	LexRegexPattern = clausePattern
 
 	// FIXME: fails to match start of clauses with no values
-	//        ex: (and (or ... )) fails
+	//        example: (and (or ... )) fails
 	LexRegex = regexp.MustCompile(LexRegexPattern)
 }
diff --git a/pkg/query/optimizer.go b/pkg/query/optimizer.go
index 46c2739..8b99a64 100644
--- a/pkg/query/optimizer.go
+++ b/pkg/query/optimizer.go
@@ -1,10 +1,19 @@
 package query
 
 import (
+	"iter"
 	"slices"
+	"strings"
+	"sync"
+
+	"github.com/jpappel/atlas/pkg/util"
 )
 
-type Optimizer struct{}
+type Optimizer struct {
+	workers  uint
+	root     *Clause
+	isSorted bool // current sort state of statement for all clauses
+}
 
 func StatementCmp(a Statement, b Statement) int {
 	catDiff := int(a.Category - b.Category)
@@ -16,7 +25,12 @@ func StatementCmp(a Statement, b Statement) int {
 		negatedDiff = -1
 	}
 
-	return catDiff*100_000 + opDiff*100 + negatedDiff*10 + a.Value.Compare(b.Value)
+	var valDiff int
+	if a.Value != nil && b.Value != nil {
+		valDiff = a.Value.Compare(b.Value)
+	}
+
+	return catDiff*100_000 + opDiff*100 + negatedDiff*10 + valDiff
 }
 
 func StatementEq(a Statement, b Statement) bool {
@@ -25,17 +39,97 @@ func StatementEq(a Statement, b Statement) bool {
 	return a.Category == b.Category && a.Operator == b.Operator && a.Negated == b.Negated && a.Value.Compare(b.Value) == 0
 }
 
-// Merge child clauses with their parents when applicable
-func (o Optimizer) Flatten(root *Clause) {
-	stack := make([]*Clause, 0, len(root.Clauses))
-	stack = append(stack, root)
+func NewOptimizer(root *Clause, workers uint) Optimizer {
+	return Optimizer{
+		root:    root,
+		workers: workers,
+	}
+}
+
+// Perform optimizations in parallel. They should **NOT** mutate the tree
+func (o Optimizer) parallel(optimize func(*Clause)) {
+	jobs := make(chan *Clause, o.workers)
+
+	wg := &sync.WaitGroup{}
+	wg.Add(int(o.workers))
+
+	for range o.workers {
+		go func(jobs <-chan *Clause, wg *sync.WaitGroup) {
+			for clause := range jobs {
+				optimize(clause)
+			}
+			wg.Done()
+		}(jobs, wg)
+	}
+
+	for clause := range o.root.DFS() {
+		jobs <- clause
+	}
+	close(jobs)
+	wg.Wait()
+}
+
+// Perform Optimizations serially. Only use this if the tree is being modified.
+// When modifying a clause set children that should not be explored to nil
+func (o *Optimizer) serial(optimize func(*Clause)) {
+	stack := make([]*Clause, 0, len(o.root.Clauses))
+	stack = append(stack, o.root)
 	for len(stack) != 0 {
 		top := len(stack) - 1
 		node := stack[top]
 		stack = stack[:top]
 
-		hasMerged := false
+		optimize(node)
 
+		node.Clauses = slices.DeleteFunc(node.Clauses, func(child *Clause) bool {
+			return child == nil
+		})
+		stack = append(stack, node.Clauses...)
+	}
+}
+
+// Partition statements by their category without copying (slices clause.Statements)
+func (o *Optimizer) partitionStatemements(clause *Clause) iter.Seq2[catType, Statements] {
+	return func(yield func(catType, Statements) bool) {
+		var category, lastCategory catType
+		var lastCategoryStart int
+		for i, stmt := range clause.Statements {
+			category = stmt.Category
+			if category != lastCategory {
+				if !yield(lastCategory, clause.Statements[lastCategoryStart:i]) {
+					return
+				}
+				lastCategoryStart = i
+			}
+			lastCategory = category
+		}
+
+		// handle leftover
+		if !yield(category, clause.Statements[lastCategoryStart:]) {
+			return
+		}
+	}
+}
+
+func (o *Optimizer) SortStatements() {
+	o.parallel(func(c *Clause) {
+		slices.SortFunc(c.Statements, StatementCmp)
+	})
+	o.isSorted = true
+}
+
+// Simplify all statements
+func (o *Optimizer) Simplify() {
+	o.parallel(func(c *Clause) {
+		for i := range c.Statements {
+			(&c.Statements[i]).Simplify()
+		}
+	})
+}
+
+// Merge child clauses with their parents when applicable
+func (o *Optimizer) Flatten() {
+	o.serial(func(node *Clause) {
 		// merge if only child clause
 		if len(node.Statements) == 0 && len(node.Clauses) == 1 {
 			child := node.Clauses[0]
@@ -48,53 +142,319 @@ func (o Optimizer) Flatten(root *Clause) {
 		// cannot be "modernized", node.Clauses is modified in loop
 		for i := 0; i < len(node.Clauses); i++ {
 			child := node.Clauses[i]
-
-			// merge because of commutativity
-			if node.Operator == child.Operator {
-				hasMerged = true
+			isSingleStmt := len(child.Clauses) == 0 && len(child.Statements) == 1
+			// merge because of commutativity or leaf node with single statement
+			if node.Operator == child.Operator || isSingleStmt {
 				node.Statements = append(node.Statements, child.Statements...)
 				node.Clauses = append(node.Clauses, child.Clauses...)
-			} else {
-				stack = append(stack, child)
+				node.Clauses[i] = nil
+			}
+		}
+	})
+}
+
+func (o *Optimizer) Compact() {
+	o.parallel(func(c *Clause) {
+		c.Statements = slices.CompactFunc(c.Statements, StatementEq)
+	})
+	o.isSorted = false
+}
+
+func (o *Optimizer) Tidy() {
+	// ensure ordering
+	if !o.isSorted {
+		o.SortStatements()
+	}
+
+	marked := make(map[*Clause]bool, 0)
+	markedLock := &sync.Mutex{}
+	// slice away noops
+	o.parallel(func(c *Clause) {
+		// PERF: should be benchmarked against binary seach, likely no performance gain
+		//       for typical length of Statements
+		start := slices.IndexFunc(c.Statements, func(s Statement) bool {
+			// NOTE: this breaks if valid categories exist between
+			//       CAT_UNKNOWN + CAT_TITLE or after CAT_META
+			return s.Category > CAT_UNKNOWN && s.Category <= CAT_META
+		})
+
+		// this means no valid categories in statements
+		if start == -1 {
+			c.Statements = nil
+			markedLock.Lock()
+			marked[c] = true
+			markedLock.Unlock()
+			return
+		}
+
+		stop := len(c.Statements)
+		for i := stop; i > 0; i-- {
+			// NOTE: this breaks if valid categories exist after CAT_META
+			if c.Statements[i-1].Category <= CAT_META {
+				stop = i
+				break
 			}
 		}
+		c.Statements = c.Statements[start:stop]
+	})
+
+	o.serial(func(c *Clause) {
+		for i, child := range c.Clauses {
+			if !marked[child] {
+				continue
+			}
 
-		if hasMerged {
-			numChildren := len(stack) - top
-			if numChildren > 0 {
-				node.Clauses = slices.Grow(node.Clauses, numChildren)
-				node.Clauses = node.Clauses[:numChildren]
-				copy(node.Clauses, stack[top:top+numChildren])
+			if c.Operator == COP_AND {
+				c.Statements = nil
+				c.Clauses = nil
+				break
 			} else {
-				node.Clauses = nil
+				c.Clauses[i] = nil
 			}
 		}
-	}
+	})
 }
 
-func (o Optimizer) Compact(c *Clause) {
-	for clause := range c.DFS() {
-		clause.Statements = slices.CompactFunc(clause.Statements, StatementEq)
-	}
+func inverseEq(s1, s2 Statement) bool {
+	s1.Negated = true
+	return StatementEq(s1, s2)
 }
 
-// if any claus is a strict equality/inequality noop all fuzzy operations
-func strictEquality(clause Clause) error {
-	isStrict := slices.ContainsFunc(clause.Statements, func(stmt Statement) bool {
-		if stmt.Operator == OP_EQ || stmt.Operator == OP_NE {
-			return true
+// Replace contradictions with noops
+func (o *Optimizer) Contradictions() {
+	if !o.isSorted {
+		o.SortStatements()
+	}
+
+	o.parallel(func(c *Clause) {
+		removals := make(map[int]bool, 8)
+		var isContradiction func(s1, s2 Statement) bool
+		for category, stmts := range o.partitionStatemements(c) {
+			if c.Operator == COP_AND && !category.IsSet() {
+				isContradiction = func(s1, s2 Statement) bool {
+					return (s1.Operator == OP_EQ && s1.Operator == s2.Operator) || inverseEq(s1, s2)
+				}
+			} else {
+				isContradiction = inverseEq
+			}
+			clear(removals)
+			for i := range stmts {
+				a := stmts[i]
+				a.Negated = !a.Negated
+				for j := i + 1; j < len(stmts); j++ {
+					b := stmts[j]
+					if isContradiction(a, b) {
+						removals[i] = true
+						removals[j] = true
+					}
+				}
+			}
+
+			for idx := range removals {
+				stmts[idx] = Statement{}
+			}
+			if len(removals) > 0 {
+				o.isSorted = false
+			}
 		}
-		return false
 	})
+}
+
+// Remove fuzzy/range based statements when possible.
+// Does not remove contradictions.
+//
+// Examples:
+//
+//	(and d="May 1, 1886" d>="January 1, 1880") --> (and d="May 1, 1886")
+//	(and T=notes T:"monday standup") --> (and T=notes)
+//	(and T="Meeting Notes" T:notes) --> (and T="Meeting Notes")
+//	(and a="Alonzo Church" a="Alan Turing" a:turing) --> (and a="Alonzo Church" a="Alan Turing")
+//	(and a="Alonzo Church" a="Alan Turing" a:djikstra) --> (and a="Alonzo Church" a="Alan Turing" a:djikstra)
+//	(and T=foo T=bar T:foobar) --> (and T=foo T=bar)
+func (o Optimizer) StrictEquality() {
+	if !o.isSorted {
+		o.SortStatements()
+	}
+	o.parallel(func(c *Clause) {
+		if c.Operator != COP_AND {
+			return
+		}
 
-	if isStrict {
-		for i := range clause.Statements {
-			stmt := clause.Statements[i]
-			if stmt.Operator != OP_EQ && stmt.Operator != OP_NE {
-				clause.Statements[i] = Statement{}
+		stricts := make([]string, 0)
+		for category, stmts := range o.partitionStatemements(c) {
+			if category.IsSet() {
+				clear(stricts)
+				for i, s := range stmts {
+					val := strings.ToLower(s.Value.(StringValue).S)
+					if s.Operator == OP_EQ {
+						stricts = append(stricts, val)
+					} else if s.Operator == OP_AP {
+						if slices.ContainsFunc(stricts, func(strictStr string) bool {
+							return strings.Contains(strictStr, val) || strings.Contains(val, strictStr)
+						}) {
+							stmts[i] = Statement{}
+							o.isSorted = false
+						}
+					}
+				}
+			} else {
+				hasEq := false
+				for i, s := range stmts {
+					hasEq = hasEq || (s.Operator == OP_EQ)
+					if hasEq && s.Operator != OP_EQ {
+						stmts[i] = Statement{}
+						o.isSorted = false
+					}
+				}
 			}
 		}
+	})
+}
+
+// Shrink approximate statements and ranges
+//
+// Examples:
+//
+//	(or d>"2025-01-01 d>"2025-02-02") --> (or d>"2025-01-01")
+//	(and d>"2025-01-01 d>"2025-02-02") --> (and d>"2025-02-02")
+//	(or T:"Das Kapital I" T:"Das Kapital") --> (and T:"Das Kapital")
+//	(and T:"Das Kapital I" T:"Das Kapital") --> (and T:"Das Kapital I")
+func (o *Optimizer) Tighten() {
+	if !o.isSorted {
+		o.SortStatements()
 	}
 
-	return nil
+	o.parallel(func(c *Clause) {
+		for category, stmts := range o.partitionStatemements(c) {
+			if len(stmts) < 2 {
+				continue
+			}
+			if c.Operator == COP_AND {
+				if category.IsOrdered() {
+					minLT, minLE := -1, -1
+					maxGT, maxGE := -1, -1
+					for i, s := range stmts {
+						if s.Operator == OP_LT && minLT == -1 {
+							minLT = i
+						} else if s.Operator == OP_LE && minLE == -1 {
+							minLE = i
+						} else if s.Operator == OP_GE {
+							maxGE = i
+						} else if s.Operator == OP_GT {
+							maxGT = i
+						}
+					}
+
+					lowerIdx, upperIdx := -1, -1
+					if minLT != -1 && minLE != -1 {
+						ltStmt := stmts[minLT]
+						leStmt := stmts[minLE]
+						leDate := leStmt.Value.(DatetimeValue).D
+						ltDate := ltStmt.Value.(DatetimeValue).D
+
+						if ltDate.After(leDate) {
+							upperIdx = minLE
+						} else {
+							upperIdx = minLT
+						}
+					} else if minLT != -1 {
+						upperIdx = minLT
+					} else if minLE != -1 {
+						upperIdx = minLE
+					}
+					if maxGT != -1 && maxGE != -1 {
+						gtStmt := stmts[maxGT]
+						geStmt := stmts[maxGE]
+						geDate := geStmt.Value.(DatetimeValue).D
+						gtDate := gtStmt.Value.(DatetimeValue).D
+
+						if geDate.After(gtDate) {
+							lowerIdx = maxGE
+						} else {
+							lowerIdx = maxGT
+						}
+					} else if maxGT != -1 {
+						lowerIdx = maxGT
+					} else if maxGE != -1 {
+						lowerIdx = maxGE
+					}
+
+					for i, s := range stmts {
+						if !s.Operator.IsOrder() || i == lowerIdx || i == upperIdx {
+							continue
+						}
+
+						stmts[i] = Statement{}
+					}
+				} else {
+					removals := make(map[int]bool)
+					for i, s1 := range util.FilterIter(stmts, func(s Statement) bool { return s.Operator == OP_AP }) {
+						val1 := strings.ToLower(s1.Value.(StringValue).S)
+						for j, s2 := range util.FilterIter(stmts[i+1:], func(s Statement) bool { return s.Operator == OP_AP }) {
+							val2 := strings.ToLower(s2.Value.(StringValue).S)
+							if strings.Contains(val2, val1) {
+								removals[i] = true
+							} else if strings.Contains(val1, val2) {
+								removals[j] = true
+							}
+						}
+					}
+					for idx := range removals {
+						stmts[idx] = Statement{}
+					}
+					if len(removals) > 0 {
+						o.isSorted = false
+					}
+				}
+			} else {
+				if category.IsOrdered() {
+					// NOTE: doesn't handle fuzzy dates
+					minIdx := slices.IndexFunc(stmts, func(s Statement) bool {
+						return s.Operator.IsOrder()
+					})
+					maxIdx := len(stmts) - 1
+					for i, s := range slices.Backward(stmts) {
+						if s.Operator.IsOrder() {
+							maxIdx = i
+							break
+						}
+					}
+					if minIdx != -1 {
+						start, stop := minIdx, maxIdx
+						if minS := stmts[minIdx]; minS.Operator == OP_GE || minS.Operator == OP_GT {
+							start++
+						}
+						if maxS := stmts[maxIdx]; maxS.Operator == OP_LT || maxS.Operator == OP_LE {
+							stop--
+						}
+						for i := start; i <= stop; i++ {
+							stmts[i] = Statement{}
+						}
+					}
+				} else {
+					// NOTE: this has to be all pairs for correctness,
+					//       but it doesn't have to be this sloppy...... :|
+					removals := make(map[int]bool)
+					for i, s1 := range util.FilterIter(stmts, func(s Statement) bool { return s.Operator == OP_AP }) {
+						val1 := strings.ToLower(s1.Value.(StringValue).S)
+						for j, s2 := range util.FilterIter(stmts[i+1:], func(s Statement) bool { return s.Operator == OP_AP }) {
+							val2 := strings.ToLower(s2.Value.(StringValue).S)
+							if strings.Contains(val2, val1) {
+								removals[j] = true
+							} else if strings.Contains(val1, val2) {
+								removals[i] = true
+							}
+						}
+					}
+
+					for idx := range removals {
+						stmts[idx] = Statement{}
+					}
+					if len(removals) > 0 {
+						o.isSorted = false
+					}
+				}
+			}
+		}
+	})
 }
diff --git a/pkg/query/optimizer_test.go b/pkg/query/optimizer_test.go
index 4de3a12..00710f6 100644
--- a/pkg/query/optimizer_test.go
+++ b/pkg/query/optimizer_test.go
@@ -1,6 +1,7 @@
 package query_test
 
 import (
+	"runtime"
 	"slices"
 	"testing"
 
@@ -163,9 +164,10 @@ func TestClause_Flatten(t *testing.T) {
 		},
 	}
 	for _, tt := range tests {
-		o := query.Optimizer{}
+		workers := uint(runtime.NumCPU())
 		t.Run(tt.name, func(t *testing.T) {
-			o.Flatten(tt.root)
+			o := query.NewOptimizer(tt.root, workers)
+			o.Flatten()
 
 			slices.SortFunc(tt.root.Statements, query.StatementCmp)
 			slices.SortFunc(tt.expected.Statements, query.StatementCmp)
@@ -256,9 +258,10 @@ func TestOptimizer_Compact(t *testing.T) {
 		},
 	}
 	for _, tt := range tests {
-		o := query.Optimizer{}
+		workers := uint(runtime.NumCPU())
 		t.Run(tt.name, func(t *testing.T) {
-			o.Compact(tt.c)
+			o := query.NewOptimizer(tt.c, workers)
+			o.Compact()
 			got := slices.Collect(tt.c.DFS())
 			want := slices.Collect(tt.want.DFS())
 
diff --git a/pkg/query/parser.go b/pkg/query/parser.go
index 14cc227..2bfc014 100644
--- a/pkg/query/parser.go
+++ b/pkg/query/parser.go
@@ -28,8 +28,8 @@ type opType int
 const (
 	OP_UNKNOWN opType = iota
 	OP_EQ             // equal
-	OP_AP             // approximate/fuzzy
 	OP_NE             // not equal
+	OP_AP             // approximate/fuzzy
 	OP_LT             // less than
 	OP_LE             // less than or equal
 	OP_GE             // greater than or equal
@@ -122,6 +122,15 @@ func (v DatetimeValue) Compare(other Valuer) int {
 var _ Valuer = StringValue{}
 var _ Valuer = DatetimeValue{}
 
+// Return if OP_EQ behaves like set membership
+func (t catType) IsSet() bool {
+	return t == CAT_TAGS || t == CAT_AUTHOR || t == CAT_LINKS
+}
+
+func (t catType) IsOrdered() bool {
+	return t == CAT_DATE || t == CAT_FILETIME
+}
+
 func (t catType) String() string {
 	switch t {
 	case CAT_TITLE:
@@ -143,6 +152,14 @@ func (t catType) String() string {
 	}
 }
 
+func (t opType) IsFuzzy() bool {
+	return t == OP_AP || t.IsOrder()
+}
+
+func (t opType) IsOrder() bool {
+	return t == OP_LT || t == OP_LE || t == OP_GT || t == OP_GE
+}
+
 func (t opType) String() string {
 	switch t {
 	case OP_EQ: