Diffstat (limited to 'pkg')
-rw-r--r--  pkg/data/db.go                259
-rw-r--r--  pkg/data/db_test.go            65
-rw-r--r--  pkg/data/get.go               330
-rw-r--r--  pkg/data/get_test.go          217
-rw-r--r--  pkg/data/put.go               384
-rw-r--r--  pkg/data/put_test.go           98
-rw-r--r--  pkg/index/filters.go           84
-rw-r--r--  pkg/index/filters_test.go     112
-rw-r--r--  pkg/index/index.go            275
-rw-r--r--  pkg/index/index_test.go       138
-rw-r--r--  pkg/index/schema.sql           70
-rw-r--r--  pkg/query/data_structures.go   81
-rw-r--r--  pkg/query/lang.md              12
-rw-r--r--  pkg/query/parser.go            19
-rw-r--r--  pkg/query/query.go             50
15 files changed, 2194 insertions(+), 0 deletions(-)
diff --git a/pkg/data/db.go b/pkg/data/db.go
new file mode 100644
index 0000000..fcf56f9
--- /dev/null
+++ b/pkg/data/db.go
@@ -0,0 +1,259 @@
+package data
+
+import (
+ "context"
+ "database/sql"
+ "strings"
+
+ "github.com/jpappel/atlas/pkg/index"
+ _ "github.com/mattn/go-sqlite3"
+)
+
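+// Query wraps a database handle for the index; construct one with NewQuery
+// and release it with Close.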
+type Query struct {
+ db *sql.DB
+}
+
+// BatchQuery appends n copies of val, separated by delim, to query and
+// converts baseArgs into the []any that database/sql expects. n must be >= 1.
+//
+// The output is of the form
+//
+// <query> <start><(n-1)*(<val><delim>)><val><stop>
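+//
+// For example, mirroring a case from db_test.go:
+//
+// BatchQuery("INSERT INTO Bar VALUES", "", "(?,?)", ",", "", 2, []int{1, 2, 3, 4})
+//
+// returns "INSERT INTO Bar VALUES (?,?),(?,?)" and []any{1, 2, 3, 4}.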
+func BatchQuery[T any](query string, start string, val string, delim string, stop string, n int, baseArgs []T) (string, []any) {
+ args := make([]any, len(baseArgs))
+ for i, arg := range baseArgs {
+ args[i] = arg
+ }
+
+ b := strings.Builder{}
+ b.Grow(len(query) + 1 + len(start) + n*len(val) + ((n - 1) * len(delim)) + len(stop))
+
+ b.WriteString(query)
+ b.WriteRune(' ')
+ b.WriteString(start)
+ for range n - 1 {
+ b.WriteString(val)
+ b.WriteString(delim)
+ }
+ b.WriteString(val)
+ b.WriteString(stop)
+
+ return b.String(), args
+}
+
+func NewQuery(filename string) *Query {
+ query := &Query{NewDB(filename)}
+ return query
+}
+
+func NewDB(filename string) *sql.DB {
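+ // go-sqlite3 DSN parameters: _fk=true turns on foreign key enforcement and
+ // _journal=WAL selects write-ahead logging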
+ connStr := "file:" + filename + "?_fk=true&_journal=WAL"
+ db, err := sql.Open("sqlite3", connStr)
+ if err != nil {
+ panic(err)
+ }
+
+ if err := createSchema(db); err != nil {
+ panic(err)
+ }
+
+ return db
+}
+
+func NewMemDB() *sql.DB {
+ db, err := sql.Open("sqlite3", ":memory:?_fk=true")
+ if err != nil {
+ panic(err)
+ }
+
+ // every new pool connection to :memory: gets its own empty database, so
+ // keep the pool at a single connection to preserve the schema
+ db.SetMaxOpenConns(1)
+
+ if err := createSchema(db); err != nil {
+ panic(err)
+ }
+
+ return db
+}
+
+func createSchema(db *sql.DB) error {
+ ctx := context.TODO()
+ tx, err := db.BeginTx(ctx, nil)
+ if err != nil {
+ return err
+ }
+ // rolls back automatically unless the transaction was committed
+ defer tx.Rollback()
+
+ _, err = tx.Exec(`
+ CREATE TABLE IF NOT EXISTS Indexes(
+ root TEXT NOT NULL,
+ followSym DATE
+ )`)
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ _, err = tx.Exec(`
+ CREATE TABLE IF NOT EXISTS Documents(
+ id INTEGER PRIMARY KEY,
+ path TEXT UNIQUE NOT NULL,
+ title TEXT,
+ date INT,
+ fileTime INT,
+ meta BLOB
+ )`)
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ _, err = tx.Exec(`
+ CREATE TABLE IF NOT EXISTS Authors(
+ id INTEGER PRIMARY KEY,
+ name TEXT UNIQUE NOT NULL
+ )`)
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ _, err = tx.Exec(`
+ CREATE TABLE IF NOT EXISTS Aliases(
+ authorId INT NOT NULL,
+ alias TEXT UNIQUE NOT NULL,
+ FOREIGN KEY (authorId) REFERENCES Authors(id)
+ )`)
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ _, err = tx.Exec(`
+ CREATE TABLE IF NOT EXISTS Tags(
+ id INTEGER PRIMARY KEY,
+ name TEXT UNIQUE NOT NULL
+ )`)
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ _, err = tx.Exec(`
+ CREATE TABLE IF NOT EXISTS Links(
+ referencedId INT,
+ refererId INT,
+ FOREIGN KEY (referencedId) REFERENCES Documents(id),
+ FOREIGN KEY (refererId) REFERENCES Documents(id)
+ )`)
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ _, err = tx.Exec(`
+ CREATE TABLE IF NOT EXISTS DocumentAuthors(
+ docId INT NOT NULL,
+ authorId INT NOT NULL,
+ FOREIGN KEY (docId) REFERENCES Documents(id),
+ FOREIGN KEY (authorId) REFERENCES Authors(id)
+ )`)
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ _, err = tx.Exec(`
+ CREATE TABLE IF NOT EXISTS DocumentTags(
+ docId INT NOT NULL,
+ tagId INT NOT NULL,
+ FOREIGN KEY (docId) REFERENCES Documents(id),
+ FOREIGN KEY (tagId) REFERENCES Tags(id),
+ UNIQUE(docId, tagId)
+ )`)
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ _, err = tx.Exec("CREATE INDEX IF NOT EXISTS idx_doc_dates ON Documents (date)")
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ _, err = tx.Exec("CREATE INDEX IF NOT EXISTS idx_doc_titles ON Documents (title)")
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ _, err = tx.Exec("CREATE INDEX IF NOT EXISTS idx_aliases_alias ON Aliases(alias)")
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ _, err = tx.Exec("CREATE INDEX IF NOT EXISTS idx_aliases_authorId ON Aliases(authorId)")
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ _, err = tx.Exec("CREATE INDEX IF NOT EXISTS idx_doctags_tagid ON DocumentTags (tagId)")
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ return tx.Commit()
+}
+
+func (q Query) Close() error {
+ return q.db.Close()
+}
+
+// Get builds an Index rooted at indexRoot from every document stored in the database
+func (q Query) Get(indexRoot string) (*index.Index, error) {
+ ctx := context.TODO()
+
+ docs, err := FillMany{Db: q.db}.Get(ctx)
+ if err != nil {
+ return nil, err
+ }
+
+ idx := &index.Index{
+ Root: indexRoot,
+ Documents: docs,
+ Filters: index.DefaultFilters(),
+ }
+
+ return idx, nil
+}
+
+// Put writes an index's documents to the database
+func (q Query) Put(idx index.Index) error {
+ ctx := context.TODO()
+
+ insertCtx, cancel := context.WithCancelCause(ctx)
+ defer cancel(nil)
+
+ p, err := NewPutMany(q.db, idx.Documents)
+ if err != nil {
+ return err
+ }
+
+ if err := p.Insert(insertCtx); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+// Update database with values from index
+func (q Query) Update(idx index.Index) error {
+ // TODO: implement
+ return nil
+}
+
+func (q Query) GetDocument(path string) (*index.Document, error) {
+ ctx := context.TODO()
+ f := Fill{Path: path, Db: q.db}
+ return f.Get(ctx)
+}
diff --git a/pkg/data/db_test.go b/pkg/data/db_test.go
new file mode 100644
index 0000000..b90a943
--- /dev/null
+++ b/pkg/data/db_test.go
@@ -0,0 +1,65 @@
+package data_test
+
+import (
+ "slices"
+ "testing"
+
+ "github.com/jpappel/atlas/pkg/data"
+)
+
+func TestBatchQuery(t *testing.T) {
+ tests := []struct {
+ name string
+ query string
+ start string
+ val string
+ delim string
+ stop string
+ n int
+ args []int
+ wantQuery string
+ wantArgs []any
+ }{
+ {
+ "1 val group",
+ "INSERT INTO Foo VALUES",
+ "(",
+ "?",
+ ",",
+ ")",
+ 5,
+ []int{1, 2, 3, 4, 5},
+ "INSERT INTO Foo VALUES (?,?,?,?,?)",
+ []any{1, 2, 3, 4, 5},
+ },
+ {
+ "multiple val groups",
+ "INSERT INTO Bar VALUES",
+ "",
+ "(?,?)",
+ ",",
+ "",
+ 2,
+ []int{1, 2, 3, 4},
+ "INSERT INTO Bar VALUES (?,?),(?,?)",
+ []any{1, 2, 3, 4},
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ gotQuery, gotArgs := data.BatchQuery(tt.query, tt.start, tt.val, tt.delim, tt.stop, tt.n, tt.args)
+ if gotQuery != tt.wantQuery {
+ t.Error("Got different query than wanted")
+ t.Log("Wanted:\n" + tt.wantQuery)
+ t.Log("Got:\n" + gotQuery)
+ }
+
+ if !slices.Equal(tt.wantArgs, gotArgs) {
+ t.Error("Got different args than wanted")
+ t.Logf("Wanted:\t%v", tt.wantArgs)
+ t.Logf("Got:\t%v", gotArgs)
+ }
+ })
+ }
+}
diff --git a/pkg/data/get.go b/pkg/data/get.go
new file mode 100644
index 0000000..7cae03a
--- /dev/null
+++ b/pkg/data/get.go
@@ -0,0 +1,330 @@
+package data
+
+import (
+ "context"
+ "database/sql"
+ "time"
+
+ "github.com/jpappel/atlas/pkg/index"
+)
+
+// TODO: rename struct
+//
+// Fill builds a single Document from a database connection
+type Fill struct {
+ Path string
+ Db *sql.DB
+ id int
+ doc *index.Document
+}
+
+// TODO: rename struct
+//
+// FillMany builds every stored Document and its aliases from a database connection
+type FillMany struct {
+ docs map[string]*index.Document
+ ids map[string]int
+ Db *sql.DB
+}
+
+func (f Fill) Get(ctx context.Context) (*index.Document, error) {
+ f.doc = &index.Document{Path: f.Path}
+ if err := f.document(ctx); err != nil {
+ return nil, err
+ }
+ if err := f.tags(ctx); err != nil {
+ return nil, err
+ }
+ if err := f.authors(ctx); err != nil {
+ return nil, err
+ }
+ if err := f.links(ctx); err != nil {
+ return nil, err
+ }
+
+ return f.doc, nil
+}
+
+func (f FillMany) Get(ctx context.Context) (map[string]*index.Document, error) {
+ f.docs = make(map[string]*index.Document)
+ f.ids = make(map[string]int)
+
+ if err := f.documents(ctx); err != nil {
+ return nil, err
+ }
+ if err := f.tags(ctx); err != nil {
+ return nil, err
+ }
+ if err := f.links(ctx); err != nil {
+ return nil, err
+ }
+ if err := f.authors(ctx); err != nil {
+ return nil, err
+ }
+
+ return f.docs, nil
+}
+
+func (f *Fill) document(ctx context.Context) error {
+ var title sql.NullString
+ var dateEpoch sql.NullInt64
+ var fileTimeEpoch sql.NullInt64
+ var meta sql.NullString
+
+ row := f.Db.QueryRowContext(ctx, `
+ SELECT id, title, date, fileTime, meta
+ FROM Documents
+ WHERE path = ?
+ `, f.Path)
+ if err := row.Scan(&f.id, &title, &dateEpoch, &fileTimeEpoch, &meta); err != nil {
+ return err
+ }
+
+ if title.Valid {
+ f.doc.Title = title.String
+ }
+ if dateEpoch.Valid {
+ f.doc.Date = time.Unix(dateEpoch.Int64, 0)
+ }
+ if fileTimeEpoch.Valid {
+ f.doc.FileTime = time.Unix(fileTimeEpoch.Int64, 0)
+ }
+ if meta.Valid {
+ f.doc.OtherMeta = meta.String
+ }
+ return nil
+}
+
+func (f *FillMany) documents(ctx context.Context) error {
+ rows, err := f.Db.QueryContext(ctx, `
+ SELECT id, path, title, date, fileTime, meta
+ FROM Documents
+ `)
+ if err != nil {
+ return err
+ }
+ defer rows.Close()
+
+ var id int
+ var docPath string
+ var title, meta sql.NullString
+ var dateEpoch, filetimeEpoch sql.NullInt64
+
+ for rows.Next() {
+ if err := rows.Scan(&id, &docPath, &title, &dateEpoch, &filetimeEpoch, &meta); err != nil {
+ return err
+ }
+
+ doc := &index.Document{
+ Path: docPath,
+ }
+
+ if title.Valid {
+ doc.Title = title.String
+ }
+ if dateEpoch.Valid {
+ doc.Date = time.Unix(dateEpoch.Int64, 0)
+ }
+ if filetimeEpoch.Valid {
+ doc.FileTime = time.Unix(filetimeEpoch.Int64, 0)
+ }
+ if meta.Valid {
+ doc.OtherMeta = meta.String
+ }
+
+ f.docs[docPath] = doc
+ f.ids[docPath] = id
+ }
+
+ return nil
+}
+
+func (f Fill) authors(ctx context.Context) error {
+ rows, err := f.Db.QueryContext(ctx, `
+ SELECT name
+ FROM Authors
+ JOIN DocumentAuthors
+ ON Authors.id = DocumentAuthors.authorId
+ WHERE docId = ?
+ `, f.id)
+ if err != nil {
+ return err
+ }
+ defer rows.Close()
+
+ var name string
+ authors := make([]string, 0, 4)
+ for rows.Next() {
+ if err := rows.Scan(&name); err != nil {
+ return err
+ }
+ authors = append(authors, name)
+ }
+
+ f.doc.Authors = authors
+
+ return nil
+}
+
+func (f FillMany) authors(ctx context.Context) error {
+ stmt, err := f.Db.PrepareContext(ctx, `
+ SELECT name
+ FROM Authors
+ JOIN DocumentAuthors
+ ON Authors.id = DocumentAuthors.authorId
+ WHERE docId = ?
+ `)
+ if err != nil {
+ return err
+ }
+ defer stmt.Close()
+
+ // PERF: parallelize
+ var name string
+ for path, id := range f.ids {
+ rows, err := stmt.QueryContext(ctx, id)
+ if err != nil {
+ return err
+ }
+
+ doc := f.docs[path]
+ for rows.Next() {
+ if err := rows.Scan(&name); err != nil {
+ rows.Close()
+ return err
+ }
+
+ doc.Authors = append(doc.Authors, name)
+ }
+
+ rows.Close()
+ }
+
+ return nil
+}
+
+func (f Fill) tags(ctx context.Context) error {
+ rows, err := f.Db.QueryContext(ctx, `
+ SELECT name
+ FROM Tags
+ JOIN DocumentTags
+ ON Tags.id = DocumentTags.tagId
+ WHERE docId = ?
+ `, f.id)
+ if err != nil {
+ return err
+ }
+ defer rows.Close()
+
+ var tag string
+ tags := make([]string, 0, 2)
+ for rows.Next() {
+ if err := rows.Scan(&tag); err != nil {
+ return err
+ }
+ tags = append(tags, tag)
+ }
+
+ f.doc.Tags = tags
+
+ return nil
+}
+
+func (f FillMany) tags(ctx context.Context) error {
+ stmt, err := f.Db.PrepareContext(ctx, `
+ SELECT name
+ FROM Tags
+ JOIN DocumentTags
+ ON Tags.id = DocumentTags.tagId
+ WHERE docId = ?
+ `)
+ if err != nil {
+ return err
+ }
+ defer stmt.Close()
+
+ // PERF: parallelize
+ var tag string
+ for docPath, id := range f.ids {
+ rows, err := stmt.QueryContext(ctx, id)
+ if err != nil {
+ return err
+ }
+
+ doc := f.docs[docPath]
+ for rows.Next() {
+ if err := rows.Scan(&tag); err != nil {
+ rows.Close()
+ return err
+ }
+
+ doc.Tags = append(doc.Tags, tag)
+ }
+
+ rows.Close()
+ }
+
+ return nil
+}
+
+func (f Fill) links(ctx context.Context) error {
+ rows, err := f.Db.QueryContext(ctx, `
+ SELECT path
+ FROM Documents
+ JOIN Links
+ ON Links.referencedId = Documents.id
+ WHERE Links.refererId = ?
+ `, f.id)
+ if err != nil {
+ return err
+ }
+ defer rows.Close()
+
+ var link string
+ links := make([]string, 0)
+ for rows.Next() {
+ if err := rows.Scan(&link); err != nil {
+ return err
+ }
+ links = append(links, link)
+ }
+ f.doc.Links = links
+
+ return nil
+}
+
+func (f FillMany) links(ctx context.Context) error {
+ stmt, err := f.Db.PrepareContext(ctx, `
+ SELECT path
+ FROM Documents
+ JOIN Links
+ ON Links.referencedId = Documents.id
+ WHERE Links.refererId = ?
+ `)
+ if err != nil {
+ return err
+ }
+ defer stmt.Close()
+
+ // PERF: parallelize
+ var linkPath string
+ for path, id := range f.ids {
+ rows, err := stmt.QueryContext(ctx, id)
+ if err != nil {
+ return err
+ }
+
+ doc := f.docs[path]
+ for rows.Next() {
+ if err := rows.Scan(&linkPath); err != nil {
+ rows.Close()
+ return err
+ }
+ doc.Links = append(doc.Links, linkPath)
+ }
+
+ rows.Close()
+ }
+
+ return nil
+}
diff --git a/pkg/data/get_test.go b/pkg/data/get_test.go
new file mode 100644
index 0000000..14d6920
--- /dev/null
+++ b/pkg/data/get_test.go
@@ -0,0 +1,217 @@
+package data_test
+
+import (
+ "context"
+ "database/sql"
+ "errors"
+ "testing"
+ "time"
+
+ "github.com/jpappel/atlas/pkg/data"
+ "github.com/jpappel/atlas/pkg/index"
+)
+
+func singleDoc(t *testing.T) *sql.DB {
+ t.Helper()
+ db := data.NewMemDB()
+
+ if _, err := db.Exec(`
+ INSERT INTO Documents (path, title, date, fileTime)
+ VALUES ("/file", "A file", 1, 2)
+ `); err != nil {
+ t.Fatal("err inserting doc:", err)
+ }
+
+ if _, err := db.Exec(`
+ INSERT INTO Authors (name)
+ VALUES ("jp")
+ `); err != nil {
+ t.Fatal("err inserting author:", err)
+ }
+
+ if _, err := db.Exec(`
+ INSERT INTO Aliases (authorId, alias)
+ VALUES (1,"pj"), (1,"JP")
+ `); err != nil {
+ t.Fatal("err inserting aliases:", err)
+ }
+
+ if _, err := db.Exec(`
+ INSERT INTO Tags (name)
+ VALUES ("foo"), ("bar"), ("baz"), ("oof")
+ `); err != nil {
+ t.Fatal("err inserting tags:", err)
+ }
+
+ if _, err := db.Exec(`
+ INSERT INTO DocumentAuthors (docId, authorId)
+ VALUES (1, 1)
+ `); err != nil {
+ t.Fatal("err inserting docAuthors:", err)
+ }
+
+ if _, err := db.Exec(`
+ INSERT INTO DocumentTags (docId, tagId)
+ VALUES (1,1), (1,2), (1,3), (1,4)
+ `); err != nil {
+ t.Fatal("err inserting docTags:", err)
+ }
+
+ return db
+}
+
+func multiDoc(t *testing.T) *sql.DB {
+ t.Helper()
+ db := data.NewMemDB()
+
+ if _, err := db.Exec(`
+ INSERT INTO Documents (path, title, date, fileTime)
+ VALUES ("/notes/anote.md", "A note", 1, 2), ("README.md", "read this file!", 3, 4)
+ `); err != nil {
+ t.Fatal("err inserting doc:", err)
+ }
+
+ if _, err := db.Exec(`
+ INSERT INTO Authors (name)
+ VALUES ("jp"), ("anonymous")
+ `); err != nil {
+ t.Fatal("err inserting author:", err)
+ }
+
+ if _, err := db.Exec(`
+ INSERT INTO Aliases (authorId, alias)
+ VALUES (1,"pj"), (1,"JP")
+ `); err != nil {
+ t.Fatal("err inserting aliases:", err)
+ }
+
+ if _, err := db.Exec(`
+ INSERT INTO Tags (name)
+ VALUES ("foo"), ("bar"), ("baz"), ("oof")
+ `); err != nil {
+ t.Fatal("err inserting tags:", err)
+ }
+
+ if _, err := db.Exec(`
+ INSERT INTO DocumentAuthors (docId, authorId)
+ VALUES (1, 1), (2, 2), (2, 1)
+ `); err != nil {
+ t.Fatal("err inserting docAuthors:", err)
+ }
+
+ if _, err := db.Exec(`
+ INSERT INTO DocumentTags (docId, tagId)
+ VALUES (1,1), (2,2), (1,3), (2,4)
+ `); err != nil {
+ t.Fatal("err inserting docTags:", err)
+ }
+
+ return db
+}
+
+func TestFill_Get(t *testing.T) {
+ tests := []struct {
+ name string
+ newFill func(t *testing.T) data.Fill
+ want index.Document
+ wantErr error
+ }{
+ {
+ "single doc",
+ func(t *testing.T) data.Fill {
+ t.Helper()
+ return data.Fill{Path: "/file", Db: singleDoc(t)}
+ },
+ index.Document{
+ Path: "/file",
+ Title: "A file",
+ Date: time.Unix(1, 0),
+ FileTime: time.Unix(2, 0),
+ Authors: []string{"jp"},
+ Tags: []string{"foo", "bar", "oof", "baz"},
+ },
+ nil,
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ f := tt.newFill(t)
+ got, gotErr := f.Get(context.Background())
+
+ if !errors.Is(gotErr, tt.wantErr) {
+ t.Fatalf("Recieved unexpected error: got %v want %v", gotErr, tt.wantErr)
+ } else if gotErr != nil {
+ return
+ }
+
+ if !got.Equal(tt.want) {
+ t.Errorf("Get() = %+v\nWant %+v", got, tt.want)
+ }
+ })
+ }
+}
+
+func TestFillMany_Get(t *testing.T) {
+ tests := []struct {
+ name string
+ newFillMany func(t *testing.T) data.FillMany
+ want map[string]*index.Document
+ wantErr error
+ }{
+ {
+ "multi doc",
+ func(t *testing.T) data.FillMany {
+ t.Helper()
+ return data.FillMany{Db: multiDoc(t)}
+ },
+ map[string]*index.Document{
+ "/notes/anote.md": {
+ Path: "/notes/anote.md",
+ Title: "A note",
+ Date: time.Unix(1, 0),
+ FileTime: time.Unix(2, 0),
+ Authors: []string{"jp"},
+ Tags: []string{"foo", "baz"},
+ },
+ "README.md": {
+ Path: "README.md",
+ Title: "read this file!",
+ Date: time.Unix(3, 0),
+ FileTime: time.Unix(4, 0),
+ Authors: []string{"anonymous", "jp"},
+ Tags: []string{"bar", "oof"},
+ },
+ },
+ nil,
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ ctx := context.Background()
+ f := tt.newFillMany(t)
+
+ got, gotErr := f.Get(ctx)
+ if !errors.Is(gotErr, tt.wantErr) {
+ t.Fatalf("Recieved unexpected error: got %v want %v", gotErr, tt.wantErr)
+ } else if gotErr != nil {
+ return
+ }
+
+ if len(tt.want) != len(got) {
+ t.Errorf("Recieved incorrect amount of documents: got %d want %d", len(got), len(tt.want))
+ }
+
+ for path, wantDoc := range tt.want {
+ gotDoc, ok := got[path]
+ if !ok {
+ t.Errorf("Can't find %s in recieved docs", path)
+ continue
+ }
+
+ if !gotDoc.Equal(*wantDoc) {
+ t.Errorf("%s not equal %+v\nWant %+v", path, gotDoc, wantDoc)
+ }
+ }
+ })
+ }
+}
diff --git a/pkg/data/put.go b/pkg/data/put.go
new file mode 100644
index 0000000..f3fe4b6
--- /dev/null
+++ b/pkg/data/put.go
@@ -0,0 +1,384 @@
+package data
+
+import (
+ "context"
+ "database/sql"
+ "fmt"
+
+ "github.com/jpappel/atlas/pkg/index"
+)
+
+// TODO: rename struct
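+//
+// Put inserts a single document and its tags, links, and authors inside one
+// transaction; Insert commits on success and rolls back on the first error.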
+type Put struct {
+ Id int64
+ Doc index.Document
+ tx *sql.Tx
+}
+
+// TODO: rename struct
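+//
+// PutMany batch-inserts documents; Docs is keyed by rowid once documents()
+// has inserted the path-keyed originals.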
+type PutMany struct {
+ Docs map[int64]*index.Document
+ pathDocs map[string]*index.Document
+ db *sql.DB
+}
+
+func NewPut(ctx context.Context, db *sql.DB, doc index.Document) (Put, error) {
+ tx, err := db.BeginTx(ctx, nil)
+ if err != nil {
+ return Put{}, err
+ }
+ p := Put{Doc: doc, tx: tx}
+ return p, nil
+}
+
+func NewPutMany(db *sql.DB, documents map[string]*index.Document) (PutMany, error) {
+ docs := make(map[int64]*index.Document, len(documents))
+ p := PutMany{
+ Docs: docs,
+ pathDocs: documents,
+ db: db,
+ }
+ return p, nil
+}
+
+func (p Put) Insert() error {
+ if err := p.document(); err != nil {
+ p.tx.Rollback()
+ return err
+ }
+
+ if err := p.tags(); err != nil {
+ p.tx.Rollback()
+ return err
+ }
+
+ if err := p.links(); err != nil {
+ p.tx.Rollback()
+ return err
+ }
+
+ if err := p.authors(); err != nil {
+ p.tx.Rollback()
+ return err
+ }
+
+ return p.tx.Commit()
+}
+
+func (p PutMany) Insert(ctx context.Context) error {
+ if err := p.documents(ctx); err != nil {
+ return err
+ }
+
+ if err := p.tags(ctx); err != nil {
+ return err
+ }
+
+ if err := p.links(ctx); err != nil {
+ return err
+ }
+
+ if err := p.authors(ctx); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+func (p *Put) document() error {
+ title := sql.NullString{String: p.Doc.Title, Valid: p.Doc.Title != ""}
+
+ // a zero time.Time has a non-zero Unix() value, so check IsZero instead
+ date := sql.NullInt64{Int64: p.Doc.Date.Unix(), Valid: !p.Doc.Date.IsZero()}
+
+ filetime := sql.NullInt64{Int64: p.Doc.FileTime.Unix(), Valid: !p.Doc.FileTime.IsZero()}
+
+ meta := sql.NullString{String: p.Doc.OtherMeta, Valid: p.Doc.OtherMeta != ""}
+
+ result, err := p.tx.Exec(`
+ INSERT INTO Documents(path, title, date, fileTime, meta)
+ VALUES (?,?,?,?,?)
+ `, p.Doc.Path, title, date, filetime, meta)
+ if err != nil {
+ return err
+ }
+
+ p.Id, err = result.LastInsertId()
+ if err != nil {
+ return err
+ }
+
+ return nil
+}
+
+func (p *PutMany) documents(ctx context.Context) error {
+ stmt, err := p.db.PrepareContext(ctx, `
+ INSERT INTO Documents(path, title, date, fileTime, meta)
+ VALUES (?,?,?,?,?)
+ `)
+ if err != nil {
+ return err
+ }
+ defer stmt.Close()
+
+ tx, err := p.db.BeginTx(ctx, nil)
+ if err != nil {
+ return err
+ }
+
+ txStmt := tx.StmtContext(ctx, stmt)
+
+ // PERF: profile this, grabbing the docId here might save time by simplifying
+ // future inserts
+ for _, doc := range p.pathDocs {
+ title := sql.NullString{String: doc.Title, Valid: doc.Title != ""}
+ date := sql.NullInt64{Int64: doc.Date.Unix(), Valid: !doc.Date.IsZero()}
+
+ filetime := sql.NullInt64{Int64: doc.FileTime.Unix(), Valid: !doc.FileTime.IsZero()}
+
+ meta := sql.NullString{String: doc.OtherMeta, Valid: doc.OtherMeta != ""}
+
+ res, err := txStmt.ExecContext(ctx, doc.Path, title, date, filetime, meta)
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ id, err := res.LastInsertId()
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ p.Docs[id] = doc
+ }
+
+ return tx.Commit()
+}
+
+func (p Put) tags() error {
+ if len(p.Doc.Tags) == 0 {
+ return nil
+ }
+
+ query, args := BatchQuery("INSERT OR IGNORE INTO Tags (name) VALUES", "", "(?)", ",", "", len(p.Doc.Tags), p.Doc.Tags)
+ if _, err := p.tx.Exec(query, args...); err != nil {
+ return err
+ }
+
+ preQuery := fmt.Sprintf(`
+ INSERT INTO DocumentTags
+ SELECT %d, Tags.id
+ FROM Tags
+ WHERE name IN
+ `, p.Id)
+
+ query, args = BatchQuery(preQuery, "(", "?", ",", ")", len(p.Doc.Tags), p.Doc.Tags)
+ if _, err := p.tx.Exec(query, args...); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+func (p PutMany) tags(ctx context.Context) error {
+ newTagStmt, err := p.db.PrepareContext(ctx, "INSERT OR IGNORE INTO Tags (name) VALUES (?)")
+ if err != nil {
+ return err
+ }
+ defer newTagStmt.Close()
+
+ tx, err := p.db.BeginTx(ctx, nil)
+ if err != nil {
+ return err
+ }
+
+ txNewTagStmt := tx.StmtContext(ctx, newTagStmt)
+
+ for id, doc := range p.Docs {
+ if len(doc.Tags) == 0 {
+ continue
+ }
+
+ for _, tag := range doc.Tags {
+ if _, err := txNewTagStmt.ExecContext(ctx, tag); err != nil {
+ tx.Rollback()
+ return err
+ }
+ }
+
+ preQuery := fmt.Sprintf(`
+ INSERT INTO DocumentTags (docId, tagId)
+ SELECT %d, Tags.id
+ FROM Tags
+ WHERE name IN
+ `, id)
+ query, args := BatchQuery(preQuery, "(", "?", ",", ")", len(doc.Tags), doc.Tags)
+ if _, err := tx.Exec(query, args...); err != nil {
+ tx.Rollback()
+ return err
+ }
+ }
+
+ return tx.Commit()
+}
+
+func (p Put) links() error {
+ if len(p.Doc.Links) == 0 {
+ return nil
+ }
+
+ preQuery := fmt.Sprintf(`
+ INSERT INTO Links (referencedId, refererId)
+ SELECT id, %d
+ FROM Documents
+ WHERE path IN
+ `, p.Id)
+ query, args := BatchQuery(preQuery, "(", "?", ",", ")", len(p.Doc.Links), p.Doc.Links)
+ if _, err := p.tx.Exec(query, args...); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+func (p PutMany) links(ctx context.Context) error {
+ tx, err := p.db.BeginTx(ctx, nil)
+ if err != nil {
+ return err
+ }
+
+ for id, doc := range p.Docs {
+ if len(doc.Links) == 0 {
+ continue
+ }
+ preQuery := fmt.Sprintf(`
+ INSERT INTO Links (referencedId, refererId)
+ SELECT id, %d
+ FROM Documents
+ WHERE path IN
+ `, id)
+ query, args := BatchQuery(preQuery, "(", "?", ",", ")", len(doc.Links), doc.Links)
+ if _, err := tx.Exec(query, args...); err != nil {
+ tx.Rollback()
+ return err
+ }
+ }
+
+ return tx.Commit()
+}
+
+func (p Put) authors() error {
+ // TODO: consider using temp table instead of cte
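+ //
+ // two steps: insert any name matching neither an existing author nor an
+ // alias as a new author, then link the document to every matched author id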
+ if len(p.Doc.Authors) == 0 {
+ return nil
+ }
+
+ namesCTE, args := BatchQuery("WITH names(n) AS",
+ "( VALUES ", "(?)", ",", "),", len(p.Doc.Authors), p.Doc.Authors)
+
+ newAuthorsQuery := namesCTE + `
+ filtered_names AS (
+ SELECT n
+ FROM names
+ LEFT JOIN Authors on Authors.name = n
+ LEFT JOIN Aliases on Aliases.alias = n
+ WHERE Authors.name IS NULL AND Aliases.alias IS NULL
+ )
+ INSERT INTO Authors(name)
+ SELECT n FROM filtered_names
+ `
+ if _, err := p.tx.Exec(newAuthorsQuery, args...); err != nil {
+ return err
+ }
+
+ docAuthorsQuery := namesCTE + fmt.Sprintf(`
+ matched_authors AS (
+ SELECT Authors.id AS author_id
+ FROM Authors
+ LEFT JOIN Aliases
+ ON Authors.id = Aliases.authorId
+ JOIN names
+ ON Authors.name = n OR Aliases.alias = n
+ )
+ INSERT INTO DocumentAuthors(docId, authorId)
+ SELECT %d, author_id FROM matched_authors
+ `, p.Id)
+ if _, err := p.tx.Exec(docAuthorsQuery, args...); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+func (p PutMany) authors(ctx context.Context) error {
+ tx, err := p.db.BeginTx(ctx, nil)
+ if err != nil {
+ return err
+ }
+
+ _, err = p.db.Exec("CREATE TEMPORARY TABLE names (name TEXT UNIQUE NOT NULL)")
+ // _, err = tx.Exec("CREATE TEMPORARY TABLE names (name TEXT UNIQUE NOT NULL)")
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+ defer p.db.Exec("DROP TABLE IF EXISTS temp.names")
+
+ nameStmt, err := p.db.PrepareContext(ctx, "INSERT OR IGNORE INTO temp.names VALUES (?)")
+ if err != nil {
+ return err
+ }
+ defer nameStmt.Close()
+
+ txNameStmt := tx.StmtContext(ctx, nameStmt)
+ for _, doc := range p.Docs {
+ for _, name := range doc.Authors {
+ if _, err := txNameStmt.Exec(name); err != nil {
+ tx.Rollback()
+ return err
+ }
+ }
+ }
+
+ newAuthorsQuery := `
+ WITH new_names AS (
+ SELECT temp.names.name AS name
+ FROM temp.names
+ LEFT JOIN Authors ON Authors.name = temp.names.name
+ LEFT JOIN Aliases ON Aliases.alias = temp.names.name
+ WHERE Authors.name IS NULL AND Aliases.alias IS NULL
+ )
+ INSERT INTO Authors(name)
+ SELECT name FROM new_names
+ `
+
+ if _, err := tx.Exec(newAuthorsQuery); err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ if _, err := tx.Exec("DROP TABLE IF EXISTS temp.name_ids"); err != nil {
+ tx.Rollback()
+ return err
+ }
+ _, err = tx.Exec(`
+ CREATE TEMPORARY TABLE name_ids AS
+ SELECT temp.names.name AS name, COALESCE(Authors.id, Aliases.authorId) AS authorId
+ FROM temp.names
+ LEFT JOIN Authors ON temp.names.name = Authors.name
+ LEFT JOIN Aliases ON temp.names.name = Aliases.alias
+ `)
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
+ docAuthorsStmt, err := tx.PrepareContext(ctx, `
+ INSERT INTO DocumentAuthors (docId, authorId)
+ SELECT ?, authorId
+ FROM temp.name_ids
+ WHERE name = ?
+ `)
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+ defer docAuthorsStmt.Close()
+
+ for id, doc := range p.Docs {
+ for _, name := range doc.Authors {
+ if _, err := docAuthorsStmt.Exec(id, name); err != nil {
+ tx.Rollback()
+ return err
+ }
+ }
+ }
+
+ return tx.Commit()
+}
diff --git a/pkg/data/put_test.go b/pkg/data/put_test.go
new file mode 100644
index 0000000..7e5ad38
--- /dev/null
+++ b/pkg/data/put_test.go
@@ -0,0 +1,98 @@
+package data_test
+
+import (
+ "context"
+ "database/sql"
+ "errors"
+ "testing"
+ "time"
+
+ "github.com/jpappel/atlas/pkg/data"
+ "github.com/jpappel/atlas/pkg/index"
+)
+
+func TestPut_Insert(t *testing.T) {
+ tests := []struct {
+ name string
+ newDb func(t *testing.T) *sql.DB
+ doc index.Document
+ wantErr error
+ }{
+ {
+ "insert on empty",
+ func(t *testing.T) *sql.DB {
+ t.Helper()
+ return data.NewMemDB()
+ },
+ index.Document{
+ Path: "/file",
+ Title: "A file",
+ Date: time.Unix(1, 0),
+ FileTime: time.Unix(2, 0),
+ Authors: []string{"jp"},
+ Tags: []string{"foo", "bar", "oof", "baz"},
+ },
+ nil,
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ ctx := context.Background()
+ db := tt.newDb(t)
+ defer db.Close()
+
+ p, err := data.NewPut(ctx, db, tt.doc)
+ if err != nil {
+ t.Fatalf("could not construct receiver type: %v", err)
+ }
+
+ gotErr := p.Insert()
+ if !errors.Is(gotErr, tt.wantErr) {
+ t.Fatalf("Unexpected error on Insert():, want %v got %v", tt.wantErr, gotErr)
+ } else if gotErr != nil {
+ return
+ }
+
+ f := data.Fill{Path: tt.doc.Path, Db: db}
+ gotDoc, err := f.Get(ctx)
+ if err != nil {
+ t.Fatal("Error while retrieving document for comparison:", err)
+ }
+
+ if !gotDoc.Equal(tt.doc) {
+ t.Errorf("Retrieved doc is not stored doc!\nrecv: %+v\nsent: %+v", gotDoc, tt.doc)
+ }
+ })
+ }
+}
+
+func TestPutMany_Insert(t *testing.T) {
+ tests := []struct {
+ name string // description of this test case
+ // Named input parameters for receiver constructor.
+ db *sql.DB
+ documents map[string]*index.Document
+ wantErr bool
+ }{
+ // TODO: Add test cases.
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ p, err := data.NewPutMany(tt.db, tt.documents)
+ if err != nil {
+ t.Fatalf("could not construct receiver type: %v", err)
+ }
+ gotErr := p.Insert(context.Background())
+ if gotErr != nil {
+ if !tt.wantErr {
+ t.Errorf("Insert() failed: %v", gotErr)
+ }
+ return
+ }
+ if tt.wantErr {
+ t.Fatal("Insert() succeeded unexpectedly")
+ }
+ })
+ }
+}
+
diff --git a/pkg/index/filters.go b/pkg/index/filters.go
new file mode 100644
index 0000000..f59a5d6
--- /dev/null
+++ b/pkg/index/filters.go
@@ -0,0 +1,84 @@
+package index
+
+import (
+ "io"
+ "path/filepath"
+)
+
+// TODO: create excluded path filter factory
+
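+// A DocFilter reports whether a file, given its stat info and an open reader
+// positioned at the start of the file, should be indexed.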
+type DocFilter func(infoPath, io.ReadSeeker) bool
+
+func NewExtensionFilter(ext string) DocFilter {
+ return func(ip infoPath, _ io.ReadSeeker) bool {
+ return filepath.Ext(ip.path) == ext
+ }
+}
+
+func NewMaxFilesizeFilter(size int64) DocFilter {
+ return func(ip infoPath, _ io.ReadSeeker) bool {
+ return ip.info.Size() <= size
+ }
+}
+
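+// YamlHeaderFilter reports whether the reader starts with a "---\n" line and
+// later contains a closing "\n---\n", scanning in fixed-size chunks with a
+// four-byte carry so a terminator spanning two chunks is still found.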
+func YamlHeaderFilter(_ infoPath, r io.ReadSeeker) bool {
+ const bufSize = 4096
+ buf := make([]byte, bufSize)
+
+ carry := make([]byte, 4)
+ cmp := make([]byte, 4)
+ if _, err := io.ReadFull(r, carry); err != nil || string(carry) != "---\n" {
+ return false
+ }
+
+ headerFound := false
+ readMore := true
+ for readMore {
+ buf = buf[:bufSize]
+ n, err := r.Read(buf)
+ if err == io.EOF {
+ readMore = false
+ } else if err != nil {
+ return false
+ }
+ buf = buf[:n]
+
+ // look for a terminator spanning the carry and this chunk: carry[i] must
+ // be '\n' and the following four bytes (from carry and/or buf) "---\n"
+ for i := range min(4, n) {
+ if carry[i] != '\n' {
+ continue
+ }
+ for j := range 4 {
+ if i+1+j < 4 {
+ cmp[j] = carry[i+1+j]
+ } else {
+ cmp[j] = buf[i+1+j-4]
+ }
+ }
+ if string(cmp) == "---\n" {
+ headerFound = true
+ readMore = false
+ break
+ }
+ }
+ // look for a terminator entirely inside this chunk
+ for i := range n - 4 {
+ if buf[i] == '\n' && string(buf[i:i+5]) == "\n---\n" {
+ headerFound = true
+ readMore = false
+ break
+ }
+ }
+
+ if readMore && n >= 4 {
+ copy(carry, buf[n-4:])
+ }
+ }
+
+ return headerFound
+}
+
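+// DefaultFilters accepts Markdown files of at most 200 KiB.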
+func DefaultFilters() []DocFilter {
+ return []DocFilter{NewExtensionFilter(".md"), NewMaxFilesizeFilter(200 * 1024)}
+}
diff --git a/pkg/index/filters_test.go b/pkg/index/filters_test.go
new file mode 100644
index 0000000..897a82f
--- /dev/null
+++ b/pkg/index/filters_test.go
@@ -0,0 +1,112 @@
+package index
+
+import (
+ "bytes"
+ "io"
+ "os"
+ "testing"
+)
+
+func noYamlHeader() io.ReadSeeker {
+ buf := []byte("just some text")
+ return bytes.NewReader(buf)
+}
+
+func incompleteYamlHeader() io.ReadSeeker {
+ buf := []byte("---\nfoo:bar\ntitle:bizbaz\nauthor:\n-JP Appel\n---")
+ return bytes.NewReader(buf)
+}
+
+func completeYamlHeader() io.ReadSeeker {
+ buf := []byte("---\nfoo:bar\ntitle:bizbaz\nauthor:\n-JP Appel\n---\n")
+ return bytes.NewReader(buf)
+}
+
+func trailingYamlHeader() io.ReadSeeker {
+ buf := []byte("---\nfoo:bar\ntitle:bizbaz\nauthor:\n-JP Appel\n---\nhere are some content\nanother line of text")
+ return bytes.NewReader(buf)
+}
+
+func extensionless(t *testing.T) infoPath {
+ root := t.TempDir()
+ path := root + "/" + "afile"
+ f, err := os.Create(path)
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer f.Close()
+
+ if _, err := f.WriteString("this is a file"); err != nil {
+ t.Fatal(err)
+ }
+
+ info, err := f.Stat()
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ return infoPath{path, info}
+}
+
+func markdownExtension(t *testing.T) infoPath {
+ root := t.TempDir()
+ path := root + "/" + "a.md"
+ f, err := os.Create(path)
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer f.Close()
+
+ info, err := f.Stat()
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ return infoPath{path, info}
+}
+
+func TestYamlHeaderFilter(t *testing.T) {
+ tests := []struct {
+ name string
+ r io.ReadSeeker
+ want bool
+ }{
+ {"completeYamlHeader", completeYamlHeader(), true},
+ {"trailingYamlHeader", trailingYamlHeader(), true},
+ {"noYamlHeader", noYamlHeader(), false},
+ {"incompleteYamlHeader", incompleteYamlHeader(), false},
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ got := YamlHeaderFilter(infoPath{}, tt.r)
+ if got != tt.want {
+ t.Errorf("YamlHeaderFilter() = %v, want %v", got, tt.want)
+ }
+ })
+ }
+}
+
+func TestExtensionFilter(t *testing.T) {
+ tests := []struct {
+ name string
+ infoGen func(*testing.T) infoPath
+ ext string
+ want bool
+ }{
+ {"no extension, accept .md", extensionless, ".md", false},
+ {"no extension, accept all", extensionless, "", true},
+ {"markdown, accept .md", markdownExtension, ".md", true},
+ {"makdown, accept .png", markdownExtension, ".png", false},
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ filter := NewExtensionFilter(tt.ext)
+ ip := tt.infoGen(t)
+ got := filter(ip, nil)
+
+ if got != tt.want {
+ t.Errorf("ExtensionFilter() = %v, want %v", got, tt.want)
+ }
+ })
+ }
+}
diff --git a/pkg/index/index.go b/pkg/index/index.go
new file mode 100644
index 0000000..074b659
--- /dev/null
+++ b/pkg/index/index.go
@@ -0,0 +1,275 @@
+package index
+
+import (
+ "errors"
+ "fmt"
+ "io"
+ "os"
+ "slices"
+ "sync"
+ "time"
+
+ "github.com/goccy/go-yaml"
+)
+
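+// Document is the indexed metadata of a single file; the yaml tags map front
+// matter keys onto their fields.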
+type Document struct {
+ Path string
+ Title string `yaml:"title"`
+ Date time.Time `yaml:"date"`
+ FileTime time.Time
+ Authors []string `yaml:"authors"`
+ Tags []string `yaml:"tags"`
+ Links []string
+ OtherMeta string // unsure about how to handle this
+}
+
+type infoPath struct {
+ path string
+ info os.FileInfo
+}
+
+type Index struct {
+ Root string // root directory for searching
+ Documents map[string]*Document
+ Filters []DocFilter
+}
+
+func (idx Index) String() string {
+ // TODO: print info about active filters
+ return fmt.Sprintf("%s Documents[%d] Filters[%d]", idx.Root, len(idx.Documents), len(idx.Filters))
+}
+
+func (doc Document) Equal(other Document) bool {
+ if len(doc.Authors) != len(other.Authors) ||
+ len(doc.Tags) != len(other.Tags) ||
+ len(doc.Links) != len(other.Links) ||
+ doc.Path != other.Path ||
+ doc.Title != other.Title ||
+ doc.OtherMeta != other.OtherMeta ||
+ !doc.Date.Equal(other.Date) ||
+ !doc.FileTime.Equal(other.FileTime) {
+ return false
+ }
+
+ // author order is significant; tags and links are compared as sets
+ if !slices.Equal(doc.Authors, other.Authors) {
+ return false
+ }
+
+ // sort copies so callers' slices are not reordered as a side effect
+ tags, otherTags := slices.Clone(doc.Tags), slices.Clone(other.Tags)
+ slices.Sort(tags)
+ slices.Sort(otherTags)
+ if !slices.Equal(tags, otherTags) {
+ return false
+ }
+
+ links, otherLinks := slices.Clone(doc.Links), slices.Clone(other.Links)
+ slices.Sort(links)
+ slices.Sort(otherLinks)
+ return slices.Equal(links, otherLinks)
+}
+
+func visit(file infoPath, visitQueue chan<- infoPath, filterQueue chan<- infoPath, wg *sync.WaitGroup) {
+ // TODO: check if symlink, and handle appropriately
+ // TODO: extract error out of function
+
+ if file.info.IsDir() {
+ entries, err := os.ReadDir(file.path)
+ if err != nil {
+ panic(err)
+ }
+
+ wg.Add(len(entries))
+ for _, entry := range entries {
+ entryInfo, err := entry.Info()
+ if err != nil {
+ panic(err)
+ }
+ // PERF: prevents deadlock but introduces an additional goroutine overhead per file
+ go func(path string) {
+ visitQueue <- infoPath{path: path, info: entryInfo}
+ }(file.path + "/" + entry.Name())
+ }
+ } else if file.info.Mode().IsRegular() {
+ filterQueue <- file
+ }
+
+ wg.Done()
+}
+
+func workerTraverse(wg *sync.WaitGroup, visitQueue chan infoPath, filterQueue chan<- infoPath) {
+ for work := range visitQueue {
+ visit(work, visitQueue, filterQueue, wg)
+ }
+}
+
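+// Traverse walks the tree under idx.Root with numWorkers concurrent workers
+// and returns the paths of all regular files found; the WaitGroup counts
+// outstanding entries so the channels can be closed once the tree is drained.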
+func (idx Index) Traverse(numWorkers uint) []string {
+ if numWorkers <= 1 {
+ panic(fmt.Sprint("Invalid number of workers: ", numWorkers))
+ }
+ docs := make([]string, 0)
+
+ rootInfo, err := os.Stat(idx.Root)
+ if err != nil {
+ panic(err)
+ }
+
+ jobs := make(chan infoPath, numWorkers)
+ filterQueue := make(chan infoPath, numWorkers)
+
+ activeJobs := &sync.WaitGroup{}
+
+ // start workers
+ for range numWorkers {
+ go workerTraverse(activeJobs, jobs, filterQueue)
+ }
+
+ // init send
+ activeJobs.Add(1)
+ jobs <- infoPath{path: idx.Root, info: rootInfo}
+
+ // TODO: close jobs queue
+ go func() {
+ activeJobs.Wait()
+ close(jobs)
+ close(filterQueue)
+ }()
+
+ // gather
+ for doc := range filterQueue {
+ docs = append(docs, doc.path)
+ }
+
+ return docs
+}
+
+func (idx Index) FilterOne(path string) bool {
+ info, err := os.Stat(path)
+ if err != nil {
+ return false
+ }
+
+ f, err := os.Open(path)
+ if err != nil {
+ return false
+ }
+ defer f.Close()
+
+ for _, filter := range idx.Filters {
+ if !filter(infoPath{path, info}, f) {
+ return false
+ }
+ if _, err := f.Seek(0, io.SeekStart); err != nil {
+ return false
+ }
+ }
+ return true
+}
+
+func (idx Index) Filter(paths []string, numWorkers uint) []string {
+ fPaths := make([]string, 0, len(paths))
+ jobs := make(chan string, numWorkers)
+ accepted := make(chan string, numWorkers)
+ wg := &sync.WaitGroup{}
+
+ wg.Add(int(numWorkers))
+ for range numWorkers {
+ go func(jobs <-chan string, accepted chan<- string, wg *sync.WaitGroup) {
+ for path := range jobs {
+ if idx.FilterOne(path) {
+ accepted <- path
+ }
+ }
+ wg.Done()
+ }(jobs, accepted, wg)
+ }
+
+ go func(jobs chan<- string) {
+ for _, path := range paths {
+ jobs <- path
+ }
+ close(jobs)
+ }(jobs)
+
+ go func() {
+ wg.Wait()
+ close(accepted)
+ }()
+
+ for path := range accepted {
+ fPaths = append(fPaths, path)
+ }
+
+ return fPaths
+}
+
+func (idx Index) ParseOne(path string) (*Document, error) {
+ doc := &Document{}
+ doc.Path = path
+
+ f, err := os.Open(path)
+ if err != nil {
+ return nil, err
+ }
+ defer f.Close()
+
+ info, err := f.Stat()
+ if err != nil {
+ return nil, err
+ }
+ doc.FileTime = info.ModTime()
+
+ // skip the opening "---\n" front matter delimiter
+ buf := make([]byte, 4)
+ n, err := f.Read(buf)
+ if err != nil {
+ return nil, err
+ } else if n != 4 {
+ return nil, errors.New("short read")
+ }
+
+ // TODO: implement custom unmarshaller, for singular `Author`
+ dec := yaml.NewDecoder(f)
+ // TODO: handle no yaml header error
+ if err := dec.Decode(doc); err != nil {
+ return nil, err
+ }
+
+ // TODO: body parsing
+
+ return doc, nil
+}
+
+func (idx Index) Parse(paths []string, numWorkers uint) {
+ jobs := make(chan string, numWorkers)
+ results := make(chan Document, numWorkers)
+ wg := &sync.WaitGroup{}
+
+ wg.Add(int(numWorkers))
+ for range numWorkers {
+ go func(jobs <-chan string, results chan<- Document, wg *sync.WaitGroup) {
+ for path := range jobs {
+ doc, err := idx.ParseOne(path)
+ if err != nil {
+ // TODO: propagate error
+ panic(err)
+ }
+
+ results <- *doc
+ }
+ wg.Done()
+ }(jobs, results, wg)
+ }
+
+ go func(jobs chan<- string, paths []string) {
+ for _, path := range paths {
+ jobs <- path
+ }
+ close(jobs)
+ }(jobs, paths)
+
+ go func(results chan Document, wg *sync.WaitGroup) {
+ wg.Wait()
+ close(results)
+ }(results, wg)
+
+ for doc := range results {
+ idx.Documents[doc.Path] = &doc
+ }
+}
diff --git a/pkg/index/index_test.go b/pkg/index/index_test.go
new file mode 100644
index 0000000..0b5d2f2
--- /dev/null
+++ b/pkg/index/index_test.go
@@ -0,0 +1,138 @@
+package index
+
+import (
+ "fmt"
+ "os"
+ "slices"
+ "testing"
+)
+
+var indexCases map[string]func(t *testing.T) Index
+
+func init() {
+ indexCases = make(map[string]func(t *testing.T) Index)
+
+ indexCases["single file"] = func(t *testing.T) Index {
+ root := t.TempDir()
+ index := Index{Root: root, Filters: []DocFilter{NewExtensionFilter(".md")}}
+
+ f, err := os.Create(root + "/a_file.md")
+ if err != nil {
+ t.Fatal(err)
+ }
+ f.WriteString("some file contents\n")
+
+ return index
+ }
+
+ indexCases["large file"] = func(t *testing.T) Index {
+ root := t.TempDir()
+ index := Index{Root: root}
+
+ return index
+ }
+
+ indexCases["worker saturation"] = func(t *testing.T) Index {
+ root := t.TempDir()
+ index := Index{Root: root}
+
+ permission := os.FileMode(0o777)
+ for _, dirName := range []string{"a", "b", "c", "d", "e", "f"} {
+ dir := root + "/" + dirName
+ if err := os.Mkdir(dir, permission); err != nil {
+ t.Fatal(err)
+ }
+ for i := range 8 {
+ fName := fmt.Sprint(dirName, i)
+ f, err := os.Create(dir + "/" + fName)
+ if err != nil {
+ t.Fatal(err)
+ }
+ f.WriteString(fName)
+ }
+ }
+
+ return index
+ }
+}
+
+func TestIndex_Traverse(t *testing.T) {
+ tests := []struct {
+ name string
+ indexCase func(t *testing.T) Index
+ numWorkers uint
+ want []string
+ }{
+ {name: "single file", indexCase: indexCases["single file"], numWorkers: 2, want: []string{"a_file.md"}},
+ {name: "saturation test", indexCase: indexCases["worker saturation"], numWorkers: 2, want: []string{
+ "a/a0", "a/a1", "a/a2", "a/a3", "a/a4", "a/a5", "a/a6", "a/a7",
+ "b/b0", "b/b1", "b/b2", "b/b3", "b/b4", "b/b5", "b/b6", "b/b7",
+ "c/c0", "c/c1", "c/c2", "c/c3", "c/c4", "c/c5", "c/c6", "c/c7",
+ "d/d0", "d/d1", "d/d2", "d/d3", "d/d4", "d/d5", "d/d6", "d/d7",
+ "e/e0", "e/e1", "e/e2", "e/e3", "e/e4", "e/e5", "e/e6", "e/e7",
+ "f/f0", "f/f1", "f/f2", "f/f3", "f/f4", "f/f5", "f/f6", "f/f7",
+ }},
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ idx := tt.indexCase(t)
+ got := idx.Traverse(tt.numWorkers)
+
+ slices.Sort(got)
+ slices.Sort(tt.want)
+
+ n := min(len(got), len(tt.want))
+ if len(got) != len(tt.want) {
+ t.Errorf("Wanted %v got %v paths", len(tt.want), len(got))
+ t.Logf("Checking up to %d values", n)
+ }
+
+ for i := range n {
+ gotPath := got[i]
+ wantPath := idx.Root + "/" + tt.want[i]
+ if gotPath != wantPath {
+ t.Errorf("At %d wanted %v, got %v", i, wantPath, gotPath)
+ }
+ }
+ })
+ }
+}
+
+func TestIndex_Filter(t *testing.T) {
+ tests := []struct {
+ name string
+ paths []string
+ indexCase func(t *testing.T) Index
+ numWorkers uint
+ want []string
+ }{
+ {"single file", []string{"a_file.md"}, indexCases["single file"], 2, []string{"a_file.md"}},
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ idx := tt.indexCase(t)
+ for i, path := range tt.paths {
+ tt.paths[i] = idx.Root + "/" + path
+ }
+
+ got := idx.Filter(tt.paths, tt.numWorkers)
+
+ slices.Sort(got)
+ slices.Sort(tt.want)
+
+ n := min(len(got), len(tt.want))
+ if len(got) != len(tt.want) {
+ t.Errorf("Wanted %v got %v paths", len(tt.want), len(got))
+ t.Logf("Checking up to %d values", n)
+ }
+
+ for i := range n {
+ gotPath := got[i]
+ wantPath := idx.Root + "/" + tt.want[i]
+ if gotPath != wantPath {
+ t.Errorf("At %d wanted %v, got %v", i, wantPath, gotPath)
+ }
+ }
+ })
+ }
+}
diff --git a/pkg/index/schema.sql b/pkg/index/schema.sql
new file mode 100644
index 0000000..fb06351
--- /dev/null
+++ b/pkg/index/schema.sql
@@ -0,0 +1,70 @@
+-- TABLE of config values
+CREATE TABLE Indexes(
+ root TEXT NOT NULL,
+ followSym DATE
+);
+
+-- Schema
+CREATE TABLE Documents(
+ id INTEGER PRIMARY KEY,
+ path TEXT UNIQUE NOT NULL,
+ title TEXT,
+ date INT,
+ fileTime INT,
+ meta BLOB
+);
+
+CREATE TABLE Authors(
+ id INTEGER PRIMARY KEY,
+ name TEXT UNIQUE NOT NULL
+);
+
+CREATE TABLE Aliases(
+ authorId INT NOT NULL,
+ alias TEXT UNIQUE NOT NULL,
+ FOREIGN KEY (authorId) REFERENCES Authors(id)
+);
+
+CREATE TABLE Tags(
+ id INTEGER PRIMARY KEY,
+ name TEXT UNIQUE NOT NULL
+);
+
+CREATE TABLE Links(
+ referencedId INT,
+ refererId INT,
+ FOREIGN KEY (referencedId) REFERENCES Documents(id),
+ FOREIGN KEY (refererId) REFERENCES Documents(id)
+);
+
+CREATE TABLE DocumentAuthors(
+ docId INT NOT NULL,
+ authorId INT NOT NULL,
+ FOREIGN KEY (docId) REFERENCES Documents(id),
+ FOREIGN KEY (authorId) REFERENCES Authors(id)
+);
+
+CREATE TABLE DocumentTags(
+ docId INT NOT NULL,
+ tagId INT NOT NULL,
+ FOREIGN KEY (docId) REFERENCES Documents(id),
+ FOREIGN KEY (tagId) REFERENCES Tags(id),
+ UNIQUE(docId, tagId)
+);
+
+-- Indexes
+CREATE INDEX idx_doc_dates
+ON Documents (date);
+CREATE INDEX idx_doc_titles
+ON Documents (title);
+
+CREATE INDEX idx_author_name
+ON Authors(name);
+
+CREATE INDEX idx_aliases_alias
+ON Aliases(alias);
+CREATE INDEX idx_aliases_authorId
+ON Aliases(authorId);
+
+CREATE INDEX idx_doctags_tagid
+ON DocumentTags (tagId);
diff --git a/pkg/query/data_structures.go b/pkg/query/data_structures.go
new file mode 100644
index 0000000..c48ecde
--- /dev/null
+++ b/pkg/query/data_structures.go
@@ -0,0 +1,81 @@
+package query
+
+import "errors"
+
+// nodeStack is a LIFO stack of AST nodes; it is not safe for concurrent use.
+type nodeStack struct {
+ buf []*Node
+}
+
+// Push and Pop mutate the stack, so they need pointer receivers
+func (s *nodeStack) Push(n *Node) {
+ s.buf = append(s.buf, n)
+}
+func (s *nodeStack) Pop() *Node {
+ lastIndex := len(s.buf) - 1
+ n := s.buf[lastIndex]
+ s.buf = s.buf[:lastIndex]
+ return n
+}
+func (s nodeStack) Peek() *Node {
+ return s.buf[len(s.buf)-1]
+}
+func (s nodeStack) IsEmpty() bool {
+ return len(s.buf) == 0
+}
+
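+// nodeQueue is a fixed-capacity ring buffer of nodes: head == tail means
+// empty, so one slot always stays unused and a buffer of length c holds at
+// most c-1 nodes.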
+type nodeQueue struct {
+ buf []*Node
+ head int
+ tail int
+}
+
+func makeNodeQueue(initial *Node, cap int) nodeQueue {
+ // one slot is always unused, so capacity 2 is the minimum that holds initial
+ if cap < 2 {
+ panic("Invalid nodeQueue Capacity")
+ }
+
+ q := nodeQueue{
+ buf: make([]*Node, cap),
+ head: 0,
+ tail: 1,
+ }
+ q.buf[0] = initial
+
+ return q
+}
+
+func (q *nodeQueue) Enqueue(n *Node) error {
+ // check capacity before writing so a full queue is left untouched
+ new_tail := (q.tail + 1) % len(q.buf)
+ if new_tail == q.head {
+ return errors.New("queue out of capacity")
+ }
+
+ q.buf[q.tail] = n
+ q.tail = new_tail
+ return nil
+}
+func (q *nodeQueue) Dequeue() (*Node, error) {
+ if q.head == q.tail {
+ return nil, errors.New("empty queue")
+ }
+
+ n := q.buf[q.head]
+ q.head = (q.head + 1) % len(q.buf)
+ return n, nil
+}
+func (q *nodeQueue) PeekHead() (*Node, error) {
+ if q.head == q.tail {
+ return nil, errors.New("empty queue")
+ }
+ return q.buf[q.head], nil
+}
+func (q *nodeQueue) PeekTail() (*Node, error) {
+ if q.head == q.tail {
+ return nil, errors.New("empty queue")
+ }
+ // tail may have wrapped to 0, so index modulo the buffer length
+ return q.buf[(q.tail-1+len(q.buf))%len(q.buf)], nil
+}
+func (q *nodeQueue) IsEmpty() bool {
+ return q.head == q.tail
+}
diff --git a/pkg/query/lang.md b/pkg/query/lang.md
new file mode 100644
index 0000000..a399cb8
--- /dev/null
+++ b/pkg/query/lang.md
@@ -0,0 +1,12 @@
+# Query Language Spec
+
+```
+<expr_list> := <expr> | <expr> <expr_list>
+
+<expr> := <statment> <bin_op> <statment>
+<statment> := <statement_start> {strings} <statment_end>
+<statment_start :=
+<statment_end> :=
+
+<bin_op> := "and" | "or" | "not" | "similar"
+```
diff --git a/pkg/query/parser.go b/pkg/query/parser.go
new file mode 100644
index 0000000..355b18c
--- /dev/null
+++ b/pkg/query/parser.go
@@ -0,0 +1,19 @@
+package query
+
+type TokenType uint64
+
+const (
+ TOKEN_ERROR TokenType = iota
+ TOKEN_EOF
+ TOKEN_AND
+ TOKEN_OR
+ TOKEN_NOT
+ TOKEN_SIMILAR
+ TOKEN_STATEMENT
+ // TODO: consider adding regex token
+)
+
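+// A Token pairs a TokenType with the source text it was scanned from.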
+type Token struct {
+ Type TokenType
+ Content string
+}
diff --git a/pkg/query/query.go b/pkg/query/query.go
new file mode 100644
index 0000000..b712370
--- /dev/null
+++ b/pkg/query/query.go
@@ -0,0 +1,50 @@
+package query
+
+type Node struct {
+ Parent *Node
+ Children []*Node
+ Token
+}
+
+type AST struct {
+ root Node
+ size uint64
+}
+
+// Walk the AST depth first.
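+// The returned function yields the next node and reports false once the walk
+// is exhausted, e.g. for n, ok := next(); ok; n, ok = next() { ... }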
+func (T AST) dfWalk() func() (*Node, bool) {
+ stack := nodeStack{make([]*Node, 0, T.size)}
+ stack.Push(&T.root)
+
+ return func() (*Node, bool) {
+ if stack.IsEmpty() {
+ return nil, false
+ }
+
+ n := stack.Pop()
+ for _, child := range n.Children {
+ stack.Push(child)
+ }
+
+ return n, true
+ }
+ }
+}
+
+// Walk the AST breadth first. Same contract as dfWalk.
+func (T AST) bfWalk() func() (*Node, bool) {
+ // +1 because the ring buffer keeps one slot unused
+ queue := makeNodeQueue(&T.root, int(T.size)+1)
+
+ return func() (*Node, bool) {
+ n, err := queue.Dequeue()
+ if err != nil {
+ return nil, false
+ }
+
+ for _, child := range n.Children {
+ queue.Enqueue(child)
+ }
+ return n, true
+ }
+}