7 files changed, 199 insertions, 33 deletions
diff --git a/pkg/query/compiler.go b/pkg/query/compiler.go
index 1e946a6..7000544 100644
--- a/pkg/query/compiler.go
+++ b/pkg/query/compiler.go
@@ -93,6 +93,8 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error
 			case OP_LT:
 				// NOTE: doesn't raise compiler error if operator used on invalid category
 				opStr = "< "
+			case OP_RE:
+				opStr = "REGEXP "
 			case OP_NE:
 				if cat.IsSet() {
 					opStr = "NOT IN "
@@ -164,6 +166,30 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error
 					idx++
 					sCount++
 				}
+			} else if op == OP_RE {
+				idx := 0
+				for _, stmt := range opStmts {
+					b.WriteString("( ")
+					b.WriteString(catStr)
+					b.WriteString("IS NOT NULL AND ")
+					if stmt.Negated {
+						b.WriteString("NOT ")
+					}
+					b.WriteString(catStr)
+					b.WriteString(opStr)
+					arg, ok := stmt.Value.buildCompile(b)
+					b.WriteString(" )")
+					if ok {
+						args = append(args, arg)
+					}
+					b.WriteByte(' ')
+					if idx != len(opStmts)-1 {
+						b.WriteString(delim)
+						b.WriteByte(' ')
+					}
+					idx++
+					sCount++
+				}
 			} else {
 				idx := 0
 				for _, stmt := range opStmts {
diff --git a/pkg/query/lexer.go b/pkg/query/lexer.go
index 79b6f0f..de344b7 100644
--- a/pkg/query/lexer.go
+++ b/pkg/query/lexer.go
@@ -29,7 +29,8 @@ const (
 	TOK_OP_LT  // less than
 	TOK_OP_LE  // less than or equal
 	TOK_OP_GE  // greater than or equal
-	TOK_OP_GT  // greaterthan
+	TOK_OP_GT  // greater than
+	TOK_OP_RE  // regex match
 	// categories
 	TOK_CAT_PATH
 	TOK_CAT_TITLE
@@ -67,6 +68,8 @@ func (tokType queryTokenType) String() string {
 		return "Equal"
 	case TOK_OP_AP:
 		return "Approximate"
+	case TOK_OP_RE:
+		return "Regular Expression"
 	case TOK_OP_NE:
 		return "Not Equal"
 	case TOK_OP_LT:
@@ -118,16 +121,18 @@ func (tokType queryTokenType) Any(expected ...queryTokenType) bool {
 	return slices.Contains(expected, tokType)
 }
 
-func (t queryTokenType) isClause() bool {
-	return t == TOK_CLAUSE_OR || t == TOK_CLAUSE_AND || t == TOK_CLAUSE_START || t == TOK_CLAUSE_END
-}
-
 func (t queryTokenType) isCategory() bool {
 	return t.Any(TOK_CAT_PATH, TOK_CAT_TITLE, TOK_CAT_AUTHOR, TOK_CAT_DATE, TOK_CAT_FILETIME, TOK_CAT_TAGS, TOK_CAT_LINKS, TOK_CAT_META)
 }
-func (t queryTokenType) isOperation() bool {
+
+func (t queryTokenType) isDateOperation() bool {
 	return t.Any(TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_LT, TOK_OP_LE, TOK_OP_GE, TOK_OP_GT)
 }
+
+func (t queryTokenType) isStringOperation() bool {
+	return t.Any(TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_RE)
+}
+
 func (t queryTokenType) isValue() bool {
 	return t == TOK_VAL_STR || t == TOK_VAL_DATETIME
 }
@@ -233,6 +238,8 @@ func tokenizeOperation(s string) Token {
 		t.Type = TOK_OP_LT
 	case ">":
 		t.Type = TOK_OP_GT
+	case "!re!":
+		t.Type = TOK_OP_RE
 	}
 
 	return t
@@ -321,7 +328,7 @@ func TokensStringify(tokens []Token) string {
 func init() {
 	negPattern := `(?<negation>-?)`
 	categoryPattern := `(?<category>T|p(?:ath)?|a(?:uthor)?|d(?:ate)?|f(?:iletime)?|t(?:ags|itle)?|l(?:inks)?|m(?:eta)?)`
-	opPattern := `(?<operator>!=|<=|>=|=|:|~|<|>)`
+	opPattern := `(?<operator>!re!|!=|<=|>=|=|:|~|<|>)`
 	valPattern := `(?<value>".*?"|\S*[^\s\)])`
 	statementPattern := `(?<statement>` + negPattern + categoryPattern + opPattern + valPattern + `)`
 	unknownPattern := `(?<unknown>\S*".*?"[^\s)]*|\S*[^\s\)])`
diff --git a/pkg/query/lexer_test.go b/pkg/query/lexer_test.go
index 5888902..35bdc4f 100644
--- a/pkg/query/lexer_test.go
+++ b/pkg/query/lexer_test.go
@@ -22,6 +22,7 @@ const (
 	TOK_OP_LE        = query.TOK_OP_LE
 	TOK_OP_GE        = query.TOK_OP_GE
 	TOK_OP_GT        = query.TOK_OP_GT
+	TOK_OP_RE        = query.TOK_OP_RE
 	TOK_CAT_TITLE    = query.TOK_CAT_TITLE
 	TOK_CAT_AUTHOR   = query.TOK_CAT_AUTHOR
 	TOK_CAT_DATE     = query.TOK_CAT_DATE
@@ -77,7 +78,7 @@ func TestLex(t *testing.T) {
 			{Type: TOK_CLAUSE_END},
 			{Type: TOK_CLAUSE_END},
 		}},
-		{"nested clauses", "a:a (or t:b t!=c) or d<=01010001 and -T~foo", []Token{
+		{"nested clauses", "a:a (or t:b t!=c) or d<=01010001 and -T~foo t!re!bar", []Token{
 			{Type: TOK_CLAUSE_START}, {TOK_CLAUSE_AND, "and"},
 			{TOK_CAT_AUTHOR, "a"}, {TOK_OP_AP, ":"}, {TOK_VAL_STR, "a"},
 			{Type: TOK_CLAUSE_START}, {TOK_CLAUSE_OR, "or"},
@@ -88,6 +89,7 @@ func TestLex(t *testing.T) {
 			{TOK_CAT_DATE, "d"}, {TOK_OP_LE, "<="}, {TOK_VAL_DATETIME, "01010001"},
 			{Type: TOK_CLAUSE_START}, {TOK_CLAUSE_AND, "and"},
 			{TOK_OP_NEG, "-"}, {TOK_CAT_TITLE, "T"}, {TOK_OP_AP, "~"}, {TOK_VAL_STR, "foo"},
+			{TOK_CAT_TAGS, "t"}, {TOK_OP_RE, "!re!"}, {TOK_VAL_STR, "bar"},
 			{Type: TOK_CLAUSE_END},
 			{Type: TOK_CLAUSE_END},
 			{Type: TOK_CLAUSE_END},
diff --git a/pkg/query/optimizer.go b/pkg/query/optimizer.go
index 3c435e8..2cc610a 100644
--- a/pkg/query/optimizer.go
+++ b/pkg/query/optimizer.go
@@ -1,6 +1,7 @@
 package query
 
 import (
+	"bytes"
 	"slices"
 	"strings"
 	"sync"
@@ -52,7 +53,6 @@ func (o Optimizer) Optimize(level int) {
 	if level < 0 {
 		return
 	} else if level == 0 {
-		// TODO: determine smarter level determination strategy
 		level = o.root.Depth()
 	}
 
@@ -63,6 +63,7 @@ func (o Optimizer) Optimize(level int) {
 		o.StrictEquality()
 		o.Tighten()
 		o.Contradictions()
+		o.MergeRegex()
 		// parallel + serial
 		o.Tidy()
 		// purely serial
@@ -326,6 +327,66 @@ func (o Optimizer) StrictEquality() {
 	})
 }
 
+// Merge regular within a clause
+func (o *Optimizer) MergeRegex() {
+	if !o.isSorted {
+		o.SortStatements()
+	}
+
+	pool := &sync.Pool{}
+	pool.New = func() any {
+		return &bytes.Buffer{}
+	}
+
+	o.parallel(func(c *Clause) {
+		if c.Operator != COP_OR {
+			return
+		}
+
+		buf := pool.Get().(*bytes.Buffer)
+		defer pool.Put(buf)
+		defer buf.Reset()
+		sortChanged := false
+		for _, catStmts := range c.Statements.CategoryPartition() {
+			for op, opStmts := range catStmts.OperatorPartition() {
+				if op != OP_RE {
+					continue
+				}
+
+				for _, stmts := range opStmts.NegatedPartition() {
+					if len(stmts) <= 1 {
+						continue
+					}
+					sortChanged = true
+
+					for i, stmt := range stmts {
+						if i == 0 {
+							buf.WriteByte('(')
+							buf.WriteString(stmt.Value.(StringValue).S)
+							buf.WriteByte('|')
+						} else if i == len(stmts)-1 {
+							buf.WriteString(stmt.Value.(StringValue).S)
+							buf.WriteByte(')')
+							stmts[i] = Statement{}
+						} else {
+							buf.WriteString(stmt.Value.(StringValue).S)
+							buf.WriteByte('|')
+							stmts[i] = Statement{}
+						}
+					}
+					stmts[0].Value = StringValue{S: buf.String()}
+					buf.Reset()
+				}
+
+				break
+			}
+		}
+		if sortChanged {
+			o.isSorted = false
+		}
+	})
+}
+
 // Shrink approximate statements and ranges
 //
 // Examples:
diff --git a/pkg/query/optimizer_test.go b/pkg/query/optimizer_test.go
index 1826c81..bb9c0ba 100644
--- a/pkg/query/optimizer_test.go
+++ b/pkg/query/optimizer_test.go
@@ -647,3 +647,75 @@ func TestOptimizer_Tighten(t *testing.T) {
 		})
 	}
 }
+
+func TestOptimizer_MergeRegex(t *testing.T) {
+	tests := []struct {
+		name string
+		c    *query.Clause
+		want query.Clause
+	}{
+		{
+			"only positve",
+			&query.Clause{
+				Operator: query.COP_OR,
+				Statements: []query.Statement{
+					{Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"a"}},
+					{Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"b"}},
+				},
+			},
+			query.Clause{
+				Operator: query.COP_OR,
+				Statements: []query.Statement{
+					{Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"(a|b)"}},
+				},
+			},
+		},
+		{
+			"multiple categories",
+			&query.Clause{
+				Operator: query.COP_OR,
+				Statements: []query.Statement{
+					{Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"a"}},
+					{Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"b"}},
+					{Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"c"}},
+					{Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"d"}},
+				},
+			},
+			query.Clause{
+				Operator: query.COP_OR,
+				Statements: []query.Statement{
+					{Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"(a|b)"}},
+					{Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"(c|d)"}},
+				},
+			},
+		},
+		{
+			"mixed negations",
+			&query.Clause{
+				Operator: query.COP_OR,
+				Statements: []query.Statement{
+					{Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"a"}},
+					{Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"b"}},
+					{Negated: true, Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"c"}},
+					{Negated: true, Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"d"}},
+				},
+			},
+			query.Clause{
+				Operator: query.COP_OR,
+				Statements: []query.Statement{
+					{Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"(a|b)"}},
+					{Negated: true, Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"(c|d)"}},
+				},
+			},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			o := query.NewOptimizer(tt.c, WORKERS)
+			o.MergeRegex()
+			o.Tidy()
+
+			clauseEqTest(t, tt.c, &tt.want)
+		})
+	}
+}
diff --git a/pkg/query/parser.go b/pkg/query/parser.go
index 0b33a37..ce4ee7b 100644
--- a/pkg/query/parser.go
+++ b/pkg/query/parser.go
@@ -36,6 +36,7 @@ const (
 	OP_LE             // less than or equal
 	OP_GE             // greater than or equal
 	OP_GT             // greater than
+	OP_RE             // regular expresion
 )
 
 type clauseOperator int16
@@ -165,7 +166,7 @@ func (t catType) String() string {
 }
 
 func (t opType) IsFuzzy() bool {
-	return t == OP_AP || t.IsOrder()
+	return t == OP_AP || t == OP_RE || t.IsOrder()
 }
 
 func (t opType) IsOrder() bool {
@@ -188,6 +189,8 @@ func (t opType) String() string {
 		return "Greater Than or Equal"
 	case OP_GT:
 		return "Greater Than"
+	case OP_RE:
+		return "Regular Expression"
 	default:
 		return "Invalid"
 	}
@@ -234,6 +237,8 @@ func tokToOp(t queryTokenType) opType {
 		return OP_GE
 	case TOK_OP_GT:
 		return OP_GT
+	case TOK_OP_RE:
+		return OP_RE
 	default:
 		return OP_UNKNOWN
 	}
@@ -241,7 +246,7 @@ func tokToOp(t queryTokenType) opType {
 
 // Apply negation to a statements operator
 func (s *Statement) Simplify() {
-	if s.Negated && s.Operator != OP_AP {
+	if s.Negated && s.Operator != OP_AP && s.Operator != OP_RE {
 		s.Negated = false
 		switch s.Operator {
 		case OP_EQ:
@@ -324,23 +329,15 @@ func (s Statements) NegatedPartition() iter.Seq2[bool, Statements] {
 	}
 
 	return func(yield func(bool, Statements) bool) {
-		firstNegated := -1
-		for i, stmt := range s {
-			if stmt.Negated {
-				firstNegated = i
-			}
-		}
-
-		if firstNegated > 0 {
-			if !yield(false, s[:firstNegated]) {
-				return
-			} else if !yield(true, s[firstNegated:]) {
-				return
-			}
-		} else {
-			if !yield(false, s) {
-				return
-			}
+		firstNegated := slices.IndexFunc(s, func(stmt Statement) bool { return stmt.Negated })
+		if firstNegated == -1 {
+			yield(false, s)
+			return
+		} else if firstNegated == 0 {
+			yield(true, s)
+			return
+		} else if !yield(false, s[:firstNegated]) {
+		} else if !yield(true, s[firstNegated:]) {
 		}
 	}
 }
@@ -507,7 +504,7 @@ func Parse(tokens []Token) (*Clause, error) {
 				stmt := Statement{Category: tokToCat(token.Type)}
 				clause.Statements = append(clause.Statements, stmt)
 			}
-		case TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_LT, TOK_OP_LE, TOK_OP_GE, TOK_OP_GT:
+		case TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_LT, TOK_OP_LE, TOK_OP_GE, TOK_OP_GT, TOK_OP_RE:
 			if !prevToken.Type.isCategory() {
 				return nil, &TokenError{
 					got:      token,
@@ -518,21 +515,21 @@ func Parse(tokens []Token) (*Clause, error) {
 
 			clause.Statements[len(clause.Statements)-1].Operator = tokToOp(token.Type)
 		case TOK_VAL_STR:
-			if !prevToken.Type.isOperation() {
+			if !prevToken.Type.isStringOperation() {
 				return nil, &TokenError{
 					got:      token,
 					gotPrev:  prevToken,
-					wantPrev: "operation",
+					wantPrev: "string operation",
 				}
 			}
 
 			clause.Statements[len(clause.Statements)-1].Value = StringValue{token.Value}
 		case TOK_VAL_DATETIME:
-			if !prevToken.Type.isOperation() {
+			if !prevToken.Type.isDateOperation() {
 				return nil, &TokenError{
 					got:      token,
 					gotPrev:  prevToken,
-					wantPrev: "operation",
+					wantPrev: "date operation",
 				}
 			}
 
diff --git a/pkg/query/parser_test.go b/pkg/query/parser_test.go
index 5f68006..2e85123 100644
--- a/pkg/query/parser_test.go
+++ b/pkg/query/parser_test.go
@@ -26,6 +26,7 @@ const (
 	OP_LE      = query.OP_LE
 	OP_GE      = query.OP_GE
 	OP_GT      = query.OP_GT
+	OP_RE      = query.OP_RE
 )
 
 func TestParse(t *testing.T) {