aboutsummaryrefslogtreecommitdiffstats
path: root/pkg/query
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/query')
-rw-r--r--pkg/query/compiler.go26
-rw-r--r--pkg/query/lexer.go21
-rw-r--r--pkg/query/lexer_test.go4
-rw-r--r--pkg/query/optimizer.go63
-rw-r--r--pkg/query/optimizer_test.go72
-rw-r--r--pkg/query/parser.go45
-rw-r--r--pkg/query/parser_test.go1
7 files changed, 199 insertions, 33 deletions
diff --git a/pkg/query/compiler.go b/pkg/query/compiler.go
index 1e946a6..7000544 100644
--- a/pkg/query/compiler.go
+++ b/pkg/query/compiler.go
@@ -93,6 +93,8 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error
case OP_LT:
// NOTE: doesn't raise compiler error if operator used on invalid category
opStr = "< "
+ case OP_RE:
+ opStr = "REGEXP "
case OP_NE:
if cat.IsSet() {
opStr = "NOT IN "
@@ -164,6 +166,30 @@ func (s Statements) buildCompile(b *strings.Builder, delim string) ([]any, error
idx++
sCount++
}
+ } else if op == OP_RE {
+ idx := 0
+ for _, stmt := range opStmts {
+ b.WriteString("( ")
+ b.WriteString(catStr)
+ b.WriteString("IS NOT NULL AND ")
+ if stmt.Negated {
+ b.WriteString("NOT ")
+ }
+ b.WriteString(catStr)
+ b.WriteString(opStr)
+ arg, ok := stmt.Value.buildCompile(b)
+ b.WriteString(" )")
+ if ok {
+ args = append(args, arg)
+ }
+ b.WriteByte(' ')
+ if idx != len(opStmts)-1 {
+ b.WriteString(delim)
+ b.WriteByte(' ')
+ }
+ idx++
+ sCount++
+ }
} else {
idx := 0
for _, stmt := range opStmts {
diff --git a/pkg/query/lexer.go b/pkg/query/lexer.go
index 79b6f0f..de344b7 100644
--- a/pkg/query/lexer.go
+++ b/pkg/query/lexer.go
@@ -29,7 +29,8 @@ const (
TOK_OP_LT // less than
TOK_OP_LE // less than or equal
TOK_OP_GE // greater than or equal
- TOK_OP_GT // greaterthan
+ TOK_OP_GT // greater than
+ TOK_OP_RE // regex match
// categories
TOK_CAT_PATH
TOK_CAT_TITLE
@@ -67,6 +68,8 @@ func (tokType queryTokenType) String() string {
return "Equal"
case TOK_OP_AP:
return "Approximate"
+ case TOK_OP_RE:
+ return "Regular Expression"
case TOK_OP_NE:
return "Not Equal"
case TOK_OP_LT:
@@ -118,16 +121,18 @@ func (tokType queryTokenType) Any(expected ...queryTokenType) bool {
return slices.Contains(expected, tokType)
}
-func (t queryTokenType) isClause() bool {
- return t == TOK_CLAUSE_OR || t == TOK_CLAUSE_AND || t == TOK_CLAUSE_START || t == TOK_CLAUSE_END
-}
-
func (t queryTokenType) isCategory() bool {
return t.Any(TOK_CAT_PATH, TOK_CAT_TITLE, TOK_CAT_AUTHOR, TOK_CAT_DATE, TOK_CAT_FILETIME, TOK_CAT_TAGS, TOK_CAT_LINKS, TOK_CAT_META)
}
-func (t queryTokenType) isOperation() bool {
+
+func (t queryTokenType) isDateOperation() bool {
return t.Any(TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_LT, TOK_OP_LE, TOK_OP_GE, TOK_OP_GT)
}
+
+func (t queryTokenType) isStringOperation() bool {
+ return t.Any(TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_RE)
+}
+
func (t queryTokenType) isValue() bool {
return t == TOK_VAL_STR || t == TOK_VAL_DATETIME
}
@@ -233,6 +238,8 @@ func tokenizeOperation(s string) Token {
t.Type = TOK_OP_LT
case ">":
t.Type = TOK_OP_GT
+ case "!re!":
+ t.Type = TOK_OP_RE
}
return t
@@ -321,7 +328,7 @@ func TokensStringify(tokens []Token) string {
func init() {
negPattern := `(?<negation>-?)`
categoryPattern := `(?<category>T|p(?:ath)?|a(?:uthor)?|d(?:ate)?|f(?:iletime)?|t(?:ags|itle)?|l(?:inks)?|m(?:eta)?)`
- opPattern := `(?<operator>!=|<=|>=|=|:|~|<|>)`
+ opPattern := `(?<operator>!re!|!=|<=|>=|=|:|~|<|>)`
valPattern := `(?<value>".*?"|\S*[^\s\)])`
statementPattern := `(?<statement>` + negPattern + categoryPattern + opPattern + valPattern + `)`
unknownPattern := `(?<unknown>\S*".*?"[^\s)]*|\S*[^\s\)])`
diff --git a/pkg/query/lexer_test.go b/pkg/query/lexer_test.go
index 5888902..35bdc4f 100644
--- a/pkg/query/lexer_test.go
+++ b/pkg/query/lexer_test.go
@@ -22,6 +22,7 @@ const (
TOK_OP_LE = query.TOK_OP_LE
TOK_OP_GE = query.TOK_OP_GE
TOK_OP_GT = query.TOK_OP_GT
+ TOK_OP_RE = query.TOK_OP_RE
TOK_CAT_TITLE = query.TOK_CAT_TITLE
TOK_CAT_AUTHOR = query.TOK_CAT_AUTHOR
TOK_CAT_DATE = query.TOK_CAT_DATE
@@ -77,7 +78,7 @@ func TestLex(t *testing.T) {
{Type: TOK_CLAUSE_END},
{Type: TOK_CLAUSE_END},
}},
- {"nested clauses", "a:a (or t:b t!=c) or d<=01010001 and -T~foo", []Token{
+ {"nested clauses", "a:a (or t:b t!=c) or d<=01010001 and -T~foo t!re!bar", []Token{
{Type: TOK_CLAUSE_START}, {TOK_CLAUSE_AND, "and"},
{TOK_CAT_AUTHOR, "a"}, {TOK_OP_AP, ":"}, {TOK_VAL_STR, "a"},
{Type: TOK_CLAUSE_START}, {TOK_CLAUSE_OR, "or"},
@@ -88,6 +89,7 @@ func TestLex(t *testing.T) {
{TOK_CAT_DATE, "d"}, {TOK_OP_LE, "<="}, {TOK_VAL_DATETIME, "01010001"},
{Type: TOK_CLAUSE_START}, {TOK_CLAUSE_AND, "and"},
{TOK_OP_NEG, "-"}, {TOK_CAT_TITLE, "T"}, {TOK_OP_AP, "~"}, {TOK_VAL_STR, "foo"},
+ {TOK_CAT_TAGS, "t"}, {TOK_OP_RE, "!re!"}, {TOK_VAL_STR, "bar"},
{Type: TOK_CLAUSE_END},
{Type: TOK_CLAUSE_END},
{Type: TOK_CLAUSE_END},
diff --git a/pkg/query/optimizer.go b/pkg/query/optimizer.go
index 3c435e8..2cc610a 100644
--- a/pkg/query/optimizer.go
+++ b/pkg/query/optimizer.go
@@ -1,6 +1,7 @@
package query
import (
+ "bytes"
"slices"
"strings"
"sync"
@@ -52,7 +53,6 @@ func (o Optimizer) Optimize(level int) {
if level < 0 {
return
} else if level == 0 {
- // TODO: determine smarter level determination strategy
level = o.root.Depth()
}
@@ -63,6 +63,7 @@ func (o Optimizer) Optimize(level int) {
o.StrictEquality()
o.Tighten()
o.Contradictions()
+ o.MergeRegex()
// parallel + serial
o.Tidy()
// purely serial
@@ -326,6 +327,66 @@ func (o Optimizer) StrictEquality() {
})
}
+// Merge regular within a clause
+func (o *Optimizer) MergeRegex() {
+ if !o.isSorted {
+ o.SortStatements()
+ }
+
+ pool := &sync.Pool{}
+ pool.New = func() any {
+ return &bytes.Buffer{}
+ }
+
+ o.parallel(func(c *Clause) {
+ if c.Operator != COP_OR {
+ return
+ }
+
+ buf := pool.Get().(*bytes.Buffer)
+ defer pool.Put(buf)
+ defer buf.Reset()
+ sortChanged := false
+ for _, catStmts := range c.Statements.CategoryPartition() {
+ for op, opStmts := range catStmts.OperatorPartition() {
+ if op != OP_RE {
+ continue
+ }
+
+ for _, stmts := range opStmts.NegatedPartition() {
+ if len(stmts) <= 1 {
+ continue
+ }
+ sortChanged = true
+
+ for i, stmt := range stmts {
+ if i == 0 {
+ buf.WriteByte('(')
+ buf.WriteString(stmt.Value.(StringValue).S)
+ buf.WriteByte('|')
+ } else if i == len(stmts)-1 {
+ buf.WriteString(stmt.Value.(StringValue).S)
+ buf.WriteByte(')')
+ stmts[i] = Statement{}
+ } else {
+ buf.WriteString(stmt.Value.(StringValue).S)
+ buf.WriteByte('|')
+ stmts[i] = Statement{}
+ }
+ }
+ stmts[0].Value = StringValue{S: buf.String()}
+ buf.Reset()
+ }
+
+ break
+ }
+ }
+ if sortChanged {
+ o.isSorted = false
+ }
+ })
+}
+
// Shrink approximate statements and ranges
//
// Examples:
diff --git a/pkg/query/optimizer_test.go b/pkg/query/optimizer_test.go
index 1826c81..bb9c0ba 100644
--- a/pkg/query/optimizer_test.go
+++ b/pkg/query/optimizer_test.go
@@ -647,3 +647,75 @@ func TestOptimizer_Tighten(t *testing.T) {
})
}
}
+
+func TestOptimizer_MergeRegex(t *testing.T) {
+ tests := []struct {
+ name string
+ c *query.Clause
+ want query.Clause
+ }{
+ {
+ "only positve",
+ &query.Clause{
+ Operator: query.COP_OR,
+ Statements: []query.Statement{
+ {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"a"}},
+ {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"b"}},
+ },
+ },
+ query.Clause{
+ Operator: query.COP_OR,
+ Statements: []query.Statement{
+ {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"(a|b)"}},
+ },
+ },
+ },
+ {
+ "multiple categories",
+ &query.Clause{
+ Operator: query.COP_OR,
+ Statements: []query.Statement{
+ {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"a"}},
+ {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"b"}},
+ {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"c"}},
+ {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"d"}},
+ },
+ },
+ query.Clause{
+ Operator: query.COP_OR,
+ Statements: []query.Statement{
+ {Category: CAT_TITLE, Operator: OP_RE, Value: query.StringValue{"(a|b)"}},
+ {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"(c|d)"}},
+ },
+ },
+ },
+ {
+ "mixed negations",
+ &query.Clause{
+ Operator: query.COP_OR,
+ Statements: []query.Statement{
+ {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"a"}},
+ {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"b"}},
+ {Negated: true, Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"c"}},
+ {Negated: true, Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"d"}},
+ },
+ },
+ query.Clause{
+ Operator: query.COP_OR,
+ Statements: []query.Statement{
+ {Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"(a|b)"}},
+ {Negated: true, Category: CAT_TAGS, Operator: OP_RE, Value: query.StringValue{"(c|d)"}},
+ },
+ },
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ o := query.NewOptimizer(tt.c, WORKERS)
+ o.MergeRegex()
+ o.Tidy()
+
+ clauseEqTest(t, tt.c, &tt.want)
+ })
+ }
+}
diff --git a/pkg/query/parser.go b/pkg/query/parser.go
index 0b33a37..ce4ee7b 100644
--- a/pkg/query/parser.go
+++ b/pkg/query/parser.go
@@ -36,6 +36,7 @@ const (
OP_LE // less than or equal
OP_GE // greater than or equal
OP_GT // greater than
+ OP_RE // regular expresion
)
type clauseOperator int16
@@ -165,7 +166,7 @@ func (t catType) String() string {
}
func (t opType) IsFuzzy() bool {
- return t == OP_AP || t.IsOrder()
+ return t == OP_AP || t == OP_RE || t.IsOrder()
}
func (t opType) IsOrder() bool {
@@ -188,6 +189,8 @@ func (t opType) String() string {
return "Greater Than or Equal"
case OP_GT:
return "Greater Than"
+ case OP_RE:
+ return "Regular Expression"
default:
return "Invalid"
}
@@ -234,6 +237,8 @@ func tokToOp(t queryTokenType) opType {
return OP_GE
case TOK_OP_GT:
return OP_GT
+ case TOK_OP_RE:
+ return OP_RE
default:
return OP_UNKNOWN
}
@@ -241,7 +246,7 @@ func tokToOp(t queryTokenType) opType {
// Apply negation to a statements operator
func (s *Statement) Simplify() {
- if s.Negated && s.Operator != OP_AP {
+ if s.Negated && s.Operator != OP_AP && s.Operator != OP_RE {
s.Negated = false
switch s.Operator {
case OP_EQ:
@@ -324,23 +329,15 @@ func (s Statements) NegatedPartition() iter.Seq2[bool, Statements] {
}
return func(yield func(bool, Statements) bool) {
- firstNegated := -1
- for i, stmt := range s {
- if stmt.Negated {
- firstNegated = i
- }
- }
-
- if firstNegated > 0 {
- if !yield(false, s[:firstNegated]) {
- return
- } else if !yield(true, s[firstNegated:]) {
- return
- }
- } else {
- if !yield(false, s) {
- return
- }
+ firstNegated := slices.IndexFunc(s, func(stmt Statement) bool { return stmt.Negated })
+ if firstNegated == -1 {
+ yield(false, s)
+ return
+ } else if firstNegated == 0 {
+ yield(true, s)
+ return
+ } else if !yield(false, s[:firstNegated]) {
+ } else if !yield(true, s[firstNegated:]) {
}
}
}
@@ -507,7 +504,7 @@ func Parse(tokens []Token) (*Clause, error) {
stmt := Statement{Category: tokToCat(token.Type)}
clause.Statements = append(clause.Statements, stmt)
}
- case TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_LT, TOK_OP_LE, TOK_OP_GE, TOK_OP_GT:
+ case TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_LT, TOK_OP_LE, TOK_OP_GE, TOK_OP_GT, TOK_OP_RE:
if !prevToken.Type.isCategory() {
return nil, &TokenError{
got: token,
@@ -518,21 +515,21 @@ func Parse(tokens []Token) (*Clause, error) {
clause.Statements[len(clause.Statements)-1].Operator = tokToOp(token.Type)
case TOK_VAL_STR:
- if !prevToken.Type.isOperation() {
+ if !prevToken.Type.isStringOperation() {
return nil, &TokenError{
got: token,
gotPrev: prevToken,
- wantPrev: "operation",
+ wantPrev: "string operation",
}
}
clause.Statements[len(clause.Statements)-1].Value = StringValue{token.Value}
case TOK_VAL_DATETIME:
- if !prevToken.Type.isOperation() {
+ if !prevToken.Type.isDateOperation() {
return nil, &TokenError{
got: token,
gotPrev: prevToken,
- wantPrev: "operation",
+ wantPrev: "date operation",
}
}
diff --git a/pkg/query/parser_test.go b/pkg/query/parser_test.go
index 5f68006..2e85123 100644
--- a/pkg/query/parser_test.go
+++ b/pkg/query/parser_test.go
@@ -26,6 +26,7 @@ const (
OP_LE = query.OP_LE
OP_GE = query.OP_GE
OP_GT = query.OP_GT
+ OP_RE = query.OP_RE
)
func TestParse(t *testing.T) {