package query

import (
	"fmt"
	"regexp"
	"slices"
	"strings"
)

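// queryTokenType identifies the lexical class of a query token.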
type queryTokenType int

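// LexRegex is the compiled lexer expression and LexRegexPattern its source
// text; both are assembled in this package's init.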
var LexRegex *regexp.Regexp
var LexRegexPattern string

const (
	TOK_UNKNOWN queryTokenType = iota

	// clause tokens
	TOK_CLAUSE_OR  // clause or
	TOK_CLAUSE_AND // clause and
	TOK_CLAUSE_START
	TOK_CLAUSE_END

	// statement tokens
	TOK_OP_NEG  // negation
	TOK_OP_EQ   // equal
	TOK_OP_AP   // approximate/fuzzy
	TOK_OP_NE   // not equal
	TOK_OP_LT   // less than
	TOK_OP_LE   // less than or equal
	TOK_OP_GE   // greater than or equal
	TOK_OP_GT   // greater than
	TOK_OP_PIPE // external pipe
	TOK_OP_ARG  // external arg
	// categories
	TOK_CAT_TITLE
	TOK_CAT_AUTHOR
	TOK_CAT_DATE
	TOK_CAT_FILETIME
	TOK_CAT_TAGS
	TOK_CAT_LINKS
	TOK_CAT_META
	// values
	TOK_VAL_STR
	TOK_VAL_DATETIME
)

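// Token is a single lexical unit of a query: its type and, where one
// exists, the raw text it was lexed from.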
type Token struct {
	Type  queryTokenType
	Value string
}

func (tokType queryTokenType) String() string {
	switch tokType {
	case TOK_UNKNOWN:
		return "Unknown"
	case TOK_CLAUSE_OR:
		return "Or"
	case TOK_CLAUSE_AND:
		return "And"
	case TOK_CLAUSE_START:
		return "Start Clause"
	case TOK_CLAUSE_END:
		return "End Clause"
	case TOK_OP_NEG:
		return "Negation"
	case TOK_OP_EQ:
		return "Equal"
	case TOK_OP_AP:
		return "Approximate"
	case TOK_OP_NE:
		return "Not Equal"
	case TOK_OP_LT:
		return "Less Than"
	case TOK_OP_LE:
		return "Less Than or Equal"
	case TOK_OP_GE:
		return "Greater Than or Equal"
	case TOK_OP_GT:
		return "Greater Than"
	case TOK_OP_PIPE:
		return "Pipe External Command"
	case TOK_OP_ARG:
		return "Argument External Command"
	case TOK_CAT_TITLE:
		return "Title Category"
	case TOK_CAT_AUTHOR:
		return "Author Category"
	case TOK_CAT_DATE:
		return "Date Category"
	case TOK_CAT_FILETIME:
		return "Filetime Category"
	case TOK_CAT_TAGS:
		return "Tags Category"
	case TOK_CAT_LINKS:
		return "Links Category"
	case TOK_CAT_META:
		return "Metadata Category"
	case TOK_VAL_DATETIME:
		return "Datetime Value"
	case TOK_VAL_STR:
		return "String Value"
	default:
		return "Invalid"
	}
}

func (t Token) String() string {
	return fmt.Sprint(t.Type.String(), ": ", t.Value)
}

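// Equal reports whether two tokens match: value-typed tokens compare both
// type and value, all other tokens compare type alone.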
func (t Token) Equal(other Token) bool {
	if t.Type.isValue() {
		return t.Type == other.Type && t.Value == other.Value
	}
	return t.Type == other.Type
}

// Any reports whether the token type is one of the expected types.
func (tokType queryTokenType) Any(expected ...queryTokenType) bool {
	return slices.Contains(expected, tokType)
}

func (t queryTokenType) isClause() bool {
	return t == TOK_CLAUSE_OR || t == TOK_CLAUSE_AND || t == TOK_CLAUSE_START || t == TOK_CLAUSE_END
}

func (t queryTokenType) isCategory() bool {
	return t.Any(TOK_CAT_TITLE, TOK_CAT_AUTHOR, TOK_CAT_DATE, TOK_CAT_FILETIME, TOK_CAT_TAGS, TOK_CAT_LINKS, TOK_CAT_META)
}
func (t queryTokenType) isOperation() bool {
	return t.Any(TOK_OP_EQ, TOK_OP_AP, TOK_OP_NE, TOK_OP_LT, TOK_OP_LE, TOK_OP_GE, TOK_OP_GT, TOK_OP_PIPE, TOK_OP_ARG)
}
func (t queryTokenType) isValue() bool {
	return t == TOK_VAL_STR || t == TOK_VAL_DATETIME
}

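// Lex tokenizes a query string into a flat token stream. The stream opens
// with an implicit AND clause, and any clauses still open at the end of
// input are closed.
//
// An illustrative sketch for a single statement:
//
//	Lex("tags=foo")
//	// -> Start Clause, And, Tags Category ("tags"), Equal ("="),
//	//    String Value ("foo"), End Clause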
func Lex(query string) []Token {
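	// submatch indices into a LexRegex match: MATCH is the whole match,
	// the rest mirror the capture-group order assembled in init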
	const (
		MATCH = iota
		CLAUSE_START
		CLAUSE_OPERATOR
		STATEMENT
		NEGATION
		CATEGORY
		OPERATOR
		VALUE
		UNKNOWN
		CLAUSE_END
	)

	matches := LexRegex.FindAllStringSubmatch(query, -1)
	tokens := make([]Token, 0, 4*len(matches))

	tokens = append(tokens, Token{Type: TOK_CLAUSE_START})
	tokens = append(tokens, Token{TOK_CLAUSE_AND, "and"}) // default to and'ing all args
	clauseLevel := 1
	for _, match := range matches {
		if match[CLAUSE_START] != "" {
			tokens = append(tokens, Token{Type: TOK_CLAUSE_START})
			// TODO: set maximum nest level
			clauseLevel += 1
		}
		if match[CLAUSE_OPERATOR] != "" {
			if len(tokens) == 0 || tokens[len(tokens)-1].Type != TOK_CLAUSE_START {
				tokens = append(tokens, Token{Type: TOK_CLAUSE_START})
				clauseLevel += 1
			}
			tokens = append(tokens, tokenizeClauseOperation(match[CLAUSE_OPERATOR]))
		}

		if t, ok := tokenizeNegation(match[NEGATION]); ok {
			tokens = append(tokens, t)
		}

		if match[CATEGORY] != "" {
			tokens = append(tokens, tokenizeCategory(match[CATEGORY]))
		}
		if match[OPERATOR] != "" {
			tokens = append(tokens, tokenizeOperation(match[OPERATOR]))
		}
		if match[VALUE] != "" {
			tokens = append(tokens, tokenizeValue(match[VALUE], tokens[len(tokens)-2].Type))
		}

		if match[UNKNOWN] != "" {
			tokens = append(tokens, Token{Value: match[UNKNOWN]})
		}

		if match[CLAUSE_END] != "" {
			tokens = append(tokens, Token{Type: TOK_CLAUSE_END})
			// TODO: raise err if clauseLevel is 0
			clauseLevel -= 1
		}
	}

	for range clauseLevel {
		tokens = append(tokens, Token{Type: TOK_CLAUSE_END})
	}

	return tokens
}

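// tokenizeClauseOperation maps "and"/"or" (in any case) to clause tokens.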
func tokenizeClauseOperation(s string) Token {
	t := Token{Value: s}
	// the clause-operator capture is case-insensitive, so normalize first
	switch strings.ToLower(s) {
	case "and":
		t.Type = TOK_CLAUSE_AND
	case "or":
		t.Type = TOK_CLAUSE_OR
	}
	return t
}

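// tokenizeNegation converts the negation capture into a token. The regex
// guarantees s is either empty or "-"; the boolean reports whether a
// negation was present.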
func tokenizeNegation(s string) (Token, bool) {
	t := Token{Value: s}
	if s == "-" {
		t.Type = TOK_OP_NEG
	}

	return t, len(s) > 0
}

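// tokenizeOperation maps an operator capture to its token type; anything
// unrecognized falls through as TOK_UNKNOWN with the raw text preserved.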
func tokenizeOperation(s string) Token {
	t := Token{Value: s}
	switch s {
	case "!=":
		t.Type = TOK_OP_NE
	case "!+":
		t.Type = TOK_OP_ARG
	case "<=":
		t.Type = TOK_OP_LE
	case ">=":
		t.Type = TOK_OP_GE
	case "=":
		t.Type = TOK_OP_EQ
	case ":", "~":
		t.Type = TOK_OP_AP
	case "<":
		t.Type = TOK_OP_LT
	case ">":
		t.Type = TOK_OP_GT
	case "!":
		t.Type = TOK_OP_PIPE
	}

	return t
}

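// tokenizeCategory maps a category capture, in short or long form, to its
// token type.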
func tokenizeCategory(s string) Token {
	t := Token{Value: s}
	switch s {
	case "T", "title":
		t.Type = TOK_CAT_TITLE
	case "a", "author":
		t.Type = TOK_CAT_AUTHOR
	case "d", "date":
		t.Type = TOK_CAT_DATE
	case "f", "filetime":
		t.Type = TOK_CAT_FILETIME
	case "t", "tags":
		t.Type = TOK_CAT_TAGS
	case "l", "links":
		t.Type = TOK_CAT_LINKS
	case "m", "meta":
		t.Type = TOK_CAT_META
	}
	return t
}

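// tokenizeValue strips surrounding double quotes from the capture and picks
// the value type from the statement's category: date-like categories yield
// datetime values, the rest yield strings.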
func tokenizeValue(s string, catType queryTokenType) Token {
	t := Token{}
	if len(s) >= 2 && s[0] == '"' && s[len(s)-1] == '"' {
		t.Value = s[1 : len(s)-1]
	} else {
		t.Value = s
	}
	switch catType {
	case TOK_CAT_DATE, TOK_CAT_FILETIME:
		t.Type = TOK_VAL_DATETIME
	case TOK_CAT_TITLE, TOK_CAT_AUTHOR, TOK_CAT_TAGS, TOK_CAT_LINKS, TOK_CAT_META:
		t.Type = TOK_VAL_STR
	}
	return t
}

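// TokensStringify renders a token stream as an indented, parenthesized
// tree for human-readable inspection.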
func TokensStringify(tokens []Token) string {
	b := strings.Builder{}

	indentLvl := 0
	writeToken := func(t Token) {
		b.WriteByte('`')
		b.WriteString(t.String())
		b.WriteByte('`')
	}

	for i, token := range tokens {
		switch token.Type {
		case TOK_CLAUSE_START:
			writeIndent(&b, indentLvl)
			b.WriteByte('(')
		case TOK_CLAUSE_END:
			indentLvl -= 1
			writeIndent(&b, indentLvl)
			b.WriteString(")\n")
		case TOK_CLAUSE_OR:
			b.WriteString("or\n")
			indentLvl += 1
		case TOK_CLAUSE_AND:
			b.WriteString("and\n")
			indentLvl += 1
		case TOK_CAT_TITLE, TOK_CAT_AUTHOR, TOK_CAT_DATE, TOK_CAT_FILETIME, TOK_CAT_TAGS, TOK_CAT_LINKS, TOK_CAT_META, TOK_OP_NEG:
			if i == 0 || tokens[i-1].Type != TOK_OP_NEG {
				writeIndent(&b, indentLvl)
			}
			writeToken(token)
		case TOK_VAL_STR, TOK_VAL_DATETIME, TOK_UNKNOWN:
			writeToken(token)
			b.WriteByte('\n')
		default:
			writeToken(token)
		}
	}

	return b.String()
}

func init() {
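	// Assemble the lexer pattern from named capture groups. The submatch
	// index constants in Lex rely on the order in which the groups appear
	// in the final pattern: clause_start, clause_operator, statement,
	// negation, category, operator, value, unknown, clause_end.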
	negPattern := `(?<negation>-?)`
	categoryPattern := `(?<category>T|a(?:uthor)?|d(?:ate)?|f(?:iletime)?|t(?:ags|itle)?|l(?:inks)?|m(?:eta)?)`
	opPattern := `(?<operator>!=|!+|<=|>=|=|:|~|<|>|!)`
	valPattern := `(?<value>".*?"|\S*[^\s\)])`
	statementPattern := `(?<statement>` + negPattern + categoryPattern + opPattern + valPattern + `)`
	unknownPattern := `(?<unknown>\S*".*?"[^\s)]*|\S*[^\s\)])`

	clauseOpPattern := `(?<clause_operator>(?i)and|or)?`
	clauseStart := `(?<clause_start>\()?`
	clauseEnd := `(?<clause_end>\))?`
	clausePattern := clauseStart + `\s*` + clauseOpPattern + `\s*(?:` + statementPattern + `|` + unknownPattern + `)\s*` + clauseEnd + `\s*`
	LexRegexPattern = clausePattern

	// FIXME: fails to match start of clauses with no values
	//        example: (and (or ... )) fails
	LexRegex = regexp.MustCompile(LexRegexPattern)
}