From b6d1375c7bdf0f06eb32f1273ad5fd4f7c0e5673 Mon Sep 17 00:00:00 2001 From: JP Appel Date: Sat, 28 Jun 2025 01:15:44 -0400 Subject: Add leveld optimization wrapper --- cmd/atlas.go | 45 ++++++++++++++++++----- pkg/query/compiler.go | 94 +++++++++++++----------------------------------- pkg/query/optimizer.go | 38 ++++++++++++++++++++ pkg/query/parser.go | 31 +++++++++++++++- pkg/shell/interpreter.go | 38 ++++++++++++++++---- pkg/shell/state.go | 91 +++++++++------------------------------------- 6 files changed, 177 insertions(+), 160 deletions(-) diff --git a/cmd/atlas.go b/cmd/atlas.go index b98b0f6..08e1a00 100644 --- a/cmd/atlas.go +++ b/cmd/atlas.go @@ -82,8 +82,9 @@ func main() { args := flag.Args() queryFlags := struct { - Output query.Outputer - CustomFormat string + Output query.Outputer + CustomFormat string + OptimizationLevel int }{} indexFlags := struct { Filters []index.DocFilter @@ -119,6 +120,7 @@ func main() { return fmt.Errorf("Unrecognized output format: %s", arg) }) queryFs.StringVar(&queryFlags.CustomFormat, "outCustomFormat", query.DefaultOutputFormat, "format string for --outFormat custom, see EXAMPLES for more details") + queryFs.IntVar(&queryFlags.OptimizationLevel, "optLevel", 0, "optimization `level` for queries, 0 is automatic, <0 to disable") queryFs.Parse(args[1:]) case "index": @@ -150,7 +152,7 @@ func main() { case "help": printHelp() flag.PrintDefaults() - os.Exit(0) + return case "shell": shellFs.Parse(args[1:]) default: @@ -186,16 +188,41 @@ func main() { querier := data.NewQuery(globalFlags.DBPath) defer querier.Close() + go func() { + if r := recover(); r != nil { + os.Exit(1) + } + }() + // command specific switch command { case "query": - // TODO: evaluate query - s, err := queryFlags.Output.Output(nil) + searchQuery := strings.Join(queryFs.Args(), " ") + tokens := query.Lex(searchQuery) + clause, err := query.Parse(tokens) + if err != nil { + fmt.Fprintln(os.Stderr, "Failed to parse query: ", err) + panic(err) + } + + if queryFlags.OptimizationLevel >= 0 { + o := query.NewOptimizer(clause, globalFlags.NumWorkers) + o.Optimize(queryFlags.OptimizationLevel) + } + + artifact, err := clause.Compile() if err != nil { - slog.Error("Error while outputing query results", slog.String("err", err.Error())) - return + panic(err) } - fmt.Print(s) + fmt.Println("query\n", artifact.Query) + fmt.Println("args\n", strings.Join(artifact.Args, ", ")) + // TODO: evaluate query + // s, err := queryFlags.Output.Output(nil) + // if err != nil { + // slog.Error("Error while outputing query results", slog.String("err", err.Error())) + // return + // } + // fmt.Print(s) case "index": idx := index.Index{Root: globalFlags.IndexRoot, Filters: indexFlags.Filters} if logger.Enabled(context.Background(), slog.LevelDebug) { @@ -233,7 +260,7 @@ func main() { interpreter := shell.NewInterpreter(state, env, globalFlags.NumWorkers) if err := interpreter.Run(); err != nil && err != io.EOF { slog.Error("Fatal error occured", slog.String("err", err.Error())) - os.Exit(1) + panic(err) } } diff --git a/pkg/query/compiler.go b/pkg/query/compiler.go index f0df4b4..e6418d5 100644 --- a/pkg/query/compiler.go +++ b/pkg/query/compiler.go @@ -9,77 +9,27 @@ import ( const MAX_CLAUSE_DEPTH int = 16 -func (stmt Statement) Compile(b *strings.Builder) (*string, error) { - if stmt.Negated { - b.WriteString("NOT ") - } +type CompilationArtifact struct { + Query string + Args []string +} - switch stmt.Category { - case CAT_TITLE: - b.WriteString("title ") - case CAT_AUTHOR: - b.WriteString("author ") - case CAT_DATE: - b.WriteString("date ") - case CAT_FILETIME: - b.WriteString("fileTime ") - case CAT_TAGS: - b.WriteString("tags ") - case CAT_LINKS: - b.WriteString("links ") - default: - return nil, &CompileError{ - fmt.Sprint("unknown or invalid category ", stmt.Category.String()), - } - } - switch stmt.Operator { - case OP_EQ: - if stmt.Category.IsSet() { - b.WriteString("IN ") +func (art CompilationArtifact) String() string { + b := strings.Builder{} + fmt.Fprintln(&b, art.Query) + b.WriteByte('[') + for i, arg := range art.Args { + if i != len(art.Args)-1 { + fmt.Fprintf(&b, "`%s`, ", arg) } else { - b.WriteString("= ") - } - case OP_AP: - b.WriteString("LIKE ") - case OP_NE: - b.WriteString("!= ") - case OP_LT: - b.WriteString("< ") - case OP_LE: - b.WriteString("<= ") - case OP_GE: - b.WriteString(">= ") - case OP_GT: - b.WriteString("> ") - default: - return nil, &CompileError{ - fmt.Sprint("unknown or invalid operand ", stmt.Operator.String()), + fmt.Fprintf(&b, "`%s`", arg) } } - - switch stmt.Value.Type() { - case VAL_STR: - s, ok := stmt.Value.(StringValue) - if !ok { - panic(CompileError{"type corruption in string value"}) - } - b.WriteString("(?) ") - return &s.S, nil - case VAL_DATETIME: - dt, ok := stmt.Value.(DatetimeValue) - if !ok { - panic(CompileError{"type corruption in datetime value"}) - } - fmt.Fprint(b, dt.D.Unix(), " ") - default: - return nil, &CompileError{ - fmt.Sprint("unknown or invalid value type ", stmt.Value.Type()), - } - } - return nil, nil + b.WriteByte(']') + return b.String() } -func (s Statements) Compile(b *strings.Builder, delim string) ([]string, error) { +func (s Statements) buildCompile(b *strings.Builder, delim string) ([]string, error) { var args []string sCount := 0 @@ -185,6 +135,7 @@ func (s Statements) Compile(b *strings.Builder, delim string) ([]string, error) start, end := util.FuzzDatetime(d.D) + b.WriteString("NOT ") b.WriteString(opStr) fmt.Fprint(b, start.Unix(), " ") b.WriteString("AND ") @@ -199,6 +150,9 @@ func (s Statements) Compile(b *strings.Builder, delim string) ([]string, error) } else { idx := 0 for _, stmt := range opStmts { + if stmt.Negated { + b.WriteString("NOT ") + } b.WriteString(catStr) b.WriteString(opStr) arg, ok := stmt.Value.buildCompile(b) @@ -229,9 +183,9 @@ func (s Statements) Compile(b *strings.Builder, delim string) ([]string, error) return args, nil } -func (root Clause) Compile() (string, []string, error) { +func (root Clause) Compile() (CompilationArtifact, error) { if d := root.Depth(); d > MAX_CLAUSE_DEPTH { - return "", nil, &CompileError{ + return CompilationArtifact{}, &CompileError{ fmt.Sprintf("exceeded maximum clause depth: %d > %d", d, MAX_CLAUSE_DEPTH), } } @@ -239,9 +193,9 @@ func (root Clause) Compile() (string, []string, error) { b := strings.Builder{} args, err := root.buildCompile(&b) if err != nil { - return "", nil, err + return CompilationArtifact{}, err } - return b.String(), args, nil + return CompilationArtifact{b.String(), args}, nil } func (c Clause) buildCompile(b *strings.Builder) ([]string, error) { @@ -257,7 +211,7 @@ func (c Clause) buildCompile(b *strings.Builder) ([]string, error) { return nil, &CompileError{fmt.Sprint("invalid clause operator ", c.Operator)} } - args, err := c.Statements.Compile(b, delim) + args, err := c.Statements.buildCompile(b, delim) if err != nil { return nil, err } diff --git a/pkg/query/optimizer.go b/pkg/query/optimizer.go index c203b9c..cdb2455 100644 --- a/pkg/query/optimizer.go +++ b/pkg/query/optimizer.go @@ -47,6 +47,38 @@ func NewOptimizer(root *Clause, workers uint) Optimizer { } } +// Optimize clause according to level. +// level 0 is automatic and levels < 0 do nothing. +func (o Optimizer) Optimize(level int) { + o.Simplify() + if level < 0 { + return + } else if level == 0 { + // TODO: determine smarter level determination strategy + level = o.root.Depth() + } + + oldDepth := o.root.Depth() + for range level { + // clause level parallel + o.Compact() + o.StrictEquality() + o.Tighten() + o.Contradictions() + // parallel + serial + o.Tidy() + // purely serial + o.Flatten() + + depth := o.root.Depth() + if depth == oldDepth { + break + } else { + oldDepth = depth + } + } +} + // Perform optimizations in parallel. They should **NOT** mutate the tree func (o Optimizer) parallel(optimize func(*Clause)) { jobs := make(chan *Clause, o.workers) @@ -131,6 +163,12 @@ func (o *Optimizer) Flatten() { }) } +// Remove multiples of equivalent statements within the same clause +// +// Examples +// +// (and a="Fred Flinstone" a="Fred Flinstone") --> (and a="Fred Flinstone") +// (or a=Shaggy -a!=Shaggy) --> (or a=Shaggy) func (o *Optimizer) Compact() { o.parallel(func(c *Clause) { c.Statements = slices.CompactFunc(c.Statements, StatementEq) diff --git a/pkg/query/parser.go b/pkg/query/parser.go index 178665d..c52530e 100644 --- a/pkg/query/parser.go +++ b/pkg/query/parser.go @@ -322,6 +322,35 @@ func (s Statements) OperatorPartition() iter.Seq2[opType, Statements] { } } +// Partition statements by their negated status without copying, similar to +// CategoryPartition and OperatorPartition +func (s Statements) NegatedPartition() iter.Seq2[bool, Statements] { + if !slices.IsSortedFunc(s, StatementCmp) { + slices.SortFunc(s, StatementCmp) + } + + return func(yield func(bool, Statements) bool) { + firstNegated := -1 + for i, stmt := range s { + if stmt.Negated { + firstNegated = i + } + } + + if firstNegated > 0 { + if !yield(false, s[:firstNegated]) { + return + } else if !yield(true, s[firstNegated:]) { + return + } + } else { + if !yield(false, s) { + return + } + } + } +} + func (c Clause) String() string { b := &strings.Builder{} c.buildString(b, 0) @@ -459,7 +488,7 @@ func Parse(tokens []Token) (*Clause, error) { } clause.Operator = COP_OR case TOK_OP_NEG: - if !prevToken.Type.Any(TOK_CLAUSE_OR, TOK_CLAUSE_AND, TOK_VAL_STR, TOK_VAL_DATETIME) { + if !prevToken.Type.Any(TOK_CLAUSE_OR, TOK_CLAUSE_AND, TOK_VAL_STR, TOK_VAL_DATETIME, TOK_CLAUSE_END) { return nil, &TokenError{ got: token, gotPrev: prevToken, diff --git a/pkg/shell/interpreter.go b/pkg/shell/interpreter.go index 6c97619..cf1454a 100644 --- a/pkg/shell/interpreter.go +++ b/pkg/shell/interpreter.go @@ -55,6 +55,7 @@ const ( ITOK_CMD_SLICE ITOK_CMD_REMATCH ITOK_CMD_REPATTERN + ITOK_CMD_LVL_OPTIMIZE ITOK_CMD_OPTIMIZE ITOK_CMD_TOKENIZE ITOK_CMD_PARSE @@ -88,6 +89,7 @@ var commands = []string{ "tokenize", "parse", "compile", + "optimize_", } func NewInterpreter(initialState State, env map[string]string, workers uint) *Interpreter { @@ -207,7 +209,10 @@ out: } return false, fmt.Errorf("No variable %s%s", tokens[1].Text, suggestionText) } + fmt.Fprintln(w, carryValue) + carryValue.Type = VAL_INVALID } + break out case ITOK_CMD_REMATCH: if carryValue.Type != VAL_STRING { return false, fmt.Errorf("Unable to match against argument of type %s", carryValue.Type) @@ -259,6 +264,25 @@ out: if err != nil { return false, err } + carryValue.Type = VAL_CLAUSE + carryValue.Val = clause + case ITOK_CMD_LVL_OPTIMIZE: + if carryValue.Type != VAL_CLAUSE { + return false, fmt.Errorf("Unable to optimize argument of type: %s", carryValue) + } + + level, err := strconv.Atoi(t.Text) + if err != nil { + return false, fmt.Errorf("Cannot parse optimization level %s", t.Text) + } + + clause, ok := carryValue.Val.(*query.Clause) + if !ok { + return true, errors.New("Type corruption during optimization, expected *query.Clause") + } + o := query.NewOptimizer(clause, inter.Workers) + o.Optimize(level) + carryValue.Type = VAL_CLAUSE carryValue.Val = clause case ITOK_CMD_OPTIMIZE: @@ -315,14 +339,13 @@ out: return true, errors.New("Type corruption during compilation, expected *query.Clause") } - query, params, err := clause.Compile() + artifact, err := clause.Compile() if err != nil { return false, err } - fmt.Fprintf(w, "query:\n%s\n--------\nparams:\n%s\n", query, params) - carryValue.Type = VAL_INVALID - break out + carryValue.Val = artifact + carryValue.Type = VAL_ARTIFACT case ITOK_VAR_NAME: // NOTE: very brittle, only allows expansion of a single variable if i == len(tokens)-1 { @@ -437,6 +460,8 @@ func (inter Interpreter) Tokenize(line string) []IToken { tokens = append(tokens, IToken{Type: ITOK_CMD_TOKENIZE}) } else if trimmedWord == "parse" { tokens = append(tokens, IToken{Type: ITOK_CMD_PARSE}) + } else if l := len("optimize_"); len(trimmedWord) > l && trimmedWord[:l] == "optimize_" { + tokens = append(tokens, IToken{ITOK_CMD_LVL_OPTIMIZE, trimmedWord[l:]}) } else if l := len("env_"); len(trimmedWord) > l && trimmedWord[:l] == "env_" { tokens = append(tokens, IToken{ITOK_CMD_ENV, trimmedWord[l:]}) } else if l := len("opt_"); len(trimmedWord) > l && trimmedWord[:l] == "opt_" { @@ -463,7 +488,7 @@ func (inter Interpreter) Tokenize(line string) []IToken { } else { tokens = append(tokens, IToken{ITOK_VAR_NAME, trimmedWord}) } - } else if prevType == ITOK_CMD_PARSE || prevType == ITOK_CMD_OPTIMIZE || prevType == ITOK_CMD_COMPILE { + } else if prevType == ITOK_CMD_PARSE || prevType == ITOK_CMD_OPTIMIZE || prevType == ITOK_CMD_LVL_OPTIMIZE || prevType == ITOK_CMD_COMPILE { tokens = append(tokens, IToken{ITOK_VAR_NAME, trimmedWord}) } else if prevType == ITOK_VAR_NAME && trimmedWord[0] == '`' { _, strLiteral, _ := strings.Cut(word, "`") @@ -497,7 +522,8 @@ func printHelp(w io.Writer) { fmt.Fprintln(w, "tokenize (string|name) - tokenize a string") fmt.Fprintln(w, " ex. tokenize `author:me") fmt.Fprintln(w, "parse (tokens|name) - parse tokens into a clause") - fmt.Fprintln(w, "opt_ (clause|name) - optimize clause tree") + fmt.Fprintln(w, "optimize_ (clause|name) - optimize clause tree to ") + fmt.Fprintln(w, "opt_ (clause|name) - apply specific optimization to clause tree") fmt.Fprintln(w, " sort - sort statements") fmt.Fprintln(w, " flatten - flatten clauses") fmt.Fprintln(w, " compact - compact equivalent statements") diff --git a/pkg/shell/state.go b/pkg/shell/state.go index bd7ecc8..5acdaf1 100644 --- a/pkg/shell/state.go +++ b/pkg/shell/state.go @@ -1,9 +1,7 @@ package shell import ( - "errors" "fmt" - "os" "strings" "github.com/jpappel/atlas/pkg/query" @@ -17,6 +15,7 @@ const ( VAL_STRING VAL_TOKENS VAL_CLAUSE + VAL_ARTIFACT ) type Value struct { @@ -38,6 +37,8 @@ func (t ValueType) String() string { return "Tokens" case VAL_CLAUSE: return "Clause" + case VAL_ARTIFACT: + return "Compilation Artifact" default: return "Unkown" } @@ -48,27 +49,33 @@ func (v Value) String() string { case VAL_INT: i, ok := v.Val.(int) if !ok { - return "Corrupted Type (expected int)" + panic("Corrupted Type (expected int)") } return fmt.Sprint(i) case VAL_STRING: s, ok := v.Val.(string) if !ok { - return "Corrupted Type (expected string)" + panic("Corrupted Type (expected string)") } return s case VAL_TOKENS: ts, ok := v.Val.([]query.Token) if !ok { - return "Corrupted Type (expected []query.Token)" + panic("Corrupted Type (expected []query.Token)") } return query.TokensStringify(ts) case VAL_CLAUSE: - rootClause, ok := v.Val.(*query.Clause) + clause, ok := v.Val.(*query.Clause) if !ok { - return "Corrupted Type (expected *query.Clause)" + panic("Corrupted Type (expected *query.Clause)") } - return rootClause.String() + return clause.String() + case VAL_ARTIFACT: + artifact, ok := v.Val.(query.CompilationArtifact) + if !ok { + panic("Corrupted Type (expected query.CompilationArtifact)") + } + return artifact.String() case VAL_INVALID: return "Invalid" } @@ -92,6 +99,8 @@ func (s State) String() string { b.WriteString(" Tokens") case VAL_CLAUSE: b.WriteString(" Clause") + case VAL_ARTIFACT: + b.WriteString(" Artifact") default: fmt.Fprintf(&b, " Unknown (%d)", v.Val) } @@ -100,69 +109,3 @@ func (s State) String() string { return b.String() } - -func (s State) CmdTokenize(input string) (Value, bool) { - if len(input) == 0 { - return Value{}, false - } - - var rawQuery string - if input[0] == '`' { - rawQuery = input[1:] - } else { - variable, ok := s[input] - if !ok { - fmt.Fprintln(os.Stderr, "Cannot tokenize: no variable with name", input) - return Value{}, false - } else if variable.Type != VAL_STRING { - fmt.Fprintln(os.Stderr, "Cannot tokenize: variable is not a string") - return Value{}, false - } - - rawQuery, ok = variable.Val.(string) - if !ok { - fmt.Fprintln(os.Stderr, "Cannot tokenize: type corruption") - fmt.Fprintln(os.Stderr, "Type corruption, expected string") - panic("Type corruption") - } - } - tokens := query.Lex(rawQuery) - return Value{VAL_TOKENS, tokens}, true -} - -func (s State) CmdParse(args string) (Value, error) { - if len(args) == 0 { - return Value{}, errors.New("no arguments for parse") - } - - var tokens []query.Token - if tokenizeArgs, found := strings.CutPrefix(args, "tokenize "); found { - val, ok := s.CmdTokenize(tokenizeArgs) - if !ok { - return Value{}, errors.New("error occured during tokenization") - } - tokens = val.Val.([]query.Token) - } else { - variable, ok := s[args] - if !ok { - fmt.Fprintln(os.Stderr, "Cannot parse: no variable with name", args) - return Value{}, errors.New("variable does not exist") - } else if variable.Type != VAL_TOKENS { - fmt.Fprintln(os.Stderr, "Cannot parse: variable is not []query.Tokens") - return Value{}, errors.New("bad variable type") - } - - tokens, ok = variable.Val.([]query.Token) - if !ok { - fmt.Fprintln(os.Stderr, "Cannot parse: type corruption") - fmt.Fprintln(os.Stderr, "Type corruption, expected []query.Tokens") - panic("Type corruption") - } - } - - clause, err := query.Parse(tokens) - if err != nil { - return Value{}, err - } - return Value{VAL_CLAUSE, *clause}, err -} -- cgit v1.2.3