From 34b8d8ff1f9d65c08a9156d72f08cf548183c6f4 Mon Sep 17 00:00:00 2001
From: JP Appel
Date: Sun, 27 Apr 2025 00:49:27 -0400
Subject: Large commit; many features

---
 pkg/data/db.go               | 259 +++++++++++++++++++++++++++++
 pkg/data/db_test.go          |  65 ++++++++
 pkg/data/get.go              | 330 +++++++++++++++++++++++++++++++++++++
 pkg/data/get_test.go         | 217 ++++++++++++++++++++++++
 pkg/data/put.go              | 384 +++++++++++++++++++++++++++++++++++++++++++
 pkg/data/put_test.go         |  98 +++++++++++
 pkg/index/filters.go         |  84 ++++++++++
 pkg/index/filters_test.go    | 112 +++++++++++++
 pkg/index/index.go           | 275 +++++++++++++++++++++++++++++++
 pkg/index/index_test.go      | 138 ++++++++++++++++
 pkg/index/schema.sql         |  70 ++++++++
 pkg/query/data_structures.go |  81 +++++++++
 pkg/query/lang.md            |  12 ++
 pkg/query/parser.go          |  19 +++
 pkg/query/query.go           |  50 ++++++
 15 files changed, 2194 insertions(+)
 create mode 100644 pkg/data/db.go
 create mode 100644 pkg/data/db_test.go
 create mode 100644 pkg/data/get.go
 create mode 100644 pkg/data/get_test.go
 create mode 100644 pkg/data/put.go
 create mode 100644 pkg/data/put_test.go
 create mode 100644 pkg/index/filters.go
 create mode 100644 pkg/index/filters_test.go
 create mode 100644 pkg/index/index.go
 create mode 100644 pkg/index/index_test.go
 create mode 100644 pkg/index/schema.sql
 create mode 100644 pkg/query/data_structures.go
 create mode 100644 pkg/query/lang.md
 create mode 100644 pkg/query/parser.go
 create mode 100644 pkg/query/query.go

(limited to 'pkg')

diff --git a/pkg/data/db.go b/pkg/data/db.go
new file mode 100644
index 0000000..fcf56f9
--- /dev/null
+++ b/pkg/data/db.go
@@ -0,0 +1,259 @@
+package data
+
+import (
+	"context"
+	"database/sql"
+	"strings"
+
+	"github.com/jpappel/atlas/pkg/index"
+	_ "github.com/mattn/go-sqlite3"
+)
+
+type Query struct {
+	db *sql.DB
+}
+
+// BatchQuery appends n copies of val, separated by delim and wrapped in
+// start/stop, to query. The output has the form
+//
+//	query start val delim val delim ... val stop
+//
+// baseArgs are converted to []any so they can be passed to database/sql.
+func BatchQuery[T any](query string, start string, val string, delim string, stop string, n int, baseArgs []T) (string, []any) {
+	args := make([]any, len(baseArgs))
+	for i, arg := range baseArgs {
+		args[i] = arg
+	}
+
+	b := strings.Builder{}
+	b.Grow(len(query) + 1 + len(start) + n*len(val) + ((n - 1) * len(delim)) + len(stop))
+
+	b.WriteString(query)
+	b.WriteRune(' ')
+	b.WriteString(start)
+	for range n - 1 {
+		b.WriteString(val)
+		b.WriteString(delim)
+	}
+	b.WriteString(val)
+	b.WriteString(stop)
+
+	return b.String(), args
+}
+
+func NewQuery(filename string) *Query {
+	query := &Query{NewDB(filename)}
+	return query
+}
+
+func NewDB(filename string) *sql.DB {
+	connStr := "file:" + filename + "?_fk=true&_journal=WAL"
+	db, err := sql.Open("sqlite3", connStr)
+	if err != nil {
+		panic(err)
+	}
+
+	if err := createSchema(db); err != nil {
+		panic(err)
+	}
+
+	return db
+}
+
+func NewMemDB() *sql.DB {
+	db, err := sql.Open("sqlite3", ":memory:?_fk=true")
+	if err != nil {
+		panic(err)
+	}
+	// every new connection to :memory: opens a fresh, empty database,
+	// so pin the pool to a single connection
+	db.SetMaxOpenConns(1)
+
+	if err := createSchema(db); err != nil {
+		panic(err)
+	}
+
+	return db
+}
+
+func createSchema(db *sql.DB) error {
+	ctx := context.TODO()
+	tx, err := db.BeginTx(ctx, nil)
+	if err != nil {
+		return err
+	}
+	// a no-op once the transaction has been committed
+	defer tx.Rollback()
+
+	_, err = tx.Exec(`
+	CREATE TABLE IF NOT EXISTS Indexes(
+		root TEXT NOT NULL,
+		followSym DATE
+	)`)
+	if err != nil {
+		return err
+	}
+
+	_, err = tx.Exec(`
+	CREATE TABLE IF NOT EXISTS Documents(
+		id INTEGER PRIMARY KEY,
+		path TEXT UNIQUE NOT NULL,
+		title TEXT,
+		date INT,
+		fileTime INT,
+		meta BLOB
+	)`)
+	if err != nil {
+		return err
+	}
+
+	_, err = tx.Exec(`
+	CREATE TABLE IF NOT EXISTS Authors(
+		id INTEGER PRIMARY KEY,
+		name TEXT UNIQUE NOT NULL
+	)`)
+	if err != nil {
+		return err
+	}
+
+	_, err = tx.Exec(`
+	CREATE TABLE IF NOT EXISTS Aliases(
+		authorId INT NOT NULL,
+		alias TEXT UNIQUE NOT NULL,
+		FOREIGN KEY (authorId) REFERENCES Authors(id)
+	)`)
+	if err != nil {
+		return err
+	}
+
+	_, err = tx.Exec(`
+	CREATE TABLE IF NOT EXISTS Tags(
+		id INTEGER PRIMARY KEY,
+		name TEXT UNIQUE NOT NULL
+	)`)
+	if err != nil {
+		return err
+	}
+
+	_, err = tx.Exec(`
+	CREATE TABLE IF NOT EXISTS Links(
+		referencedId INT,
+		refererId INT,
+		FOREIGN KEY (referencedId) REFERENCES Documents(id),
+		FOREIGN KEY (refererId) REFERENCES Documents(id)
+	)`)
+	if err != nil {
+		return err
+	}
+
+	_, err = tx.Exec(`
+	CREATE TABLE IF NOT EXISTS DocumentAuthors(
+		docId INT NOT NULL,
+		authorId INT NOT NULL,
+		FOREIGN KEY (docId) REFERENCES Documents(id),
+		FOREIGN KEY (authorId) REFERENCES Authors(id)
+	)`)
+	if err != nil {
+		return err
+	}
+
+	_, err = tx.Exec(`
+	CREATE TABLE IF NOT EXISTS DocumentTags(
+		docId INT NOT NULL,
+		tagId INT NOT NULL,
+		FOREIGN KEY (docId) REFERENCES Documents(id),
+		FOREIGN KEY (tagId) REFERENCES Tags(id),
+		UNIQUE(docId, tagId)
+	)`)
+	if err != nil {
+		return err
+	}
+
+	_, err = tx.Exec("CREATE INDEX IF NOT EXISTS idx_doc_dates ON Documents (date)")
+	if err != nil {
+		return err
+	}
+
+	_, err = tx.Exec("CREATE INDEX IF NOT EXISTS idx_doc_titles ON Documents (title)")
+	if err != nil {
+		return err
+	}
+
+	_, err = tx.Exec("CREATE INDEX IF NOT EXISTS idx_aliases_alias ON Aliases(alias)")
+	if err != nil {
+		return err
+	}
+
+	_, err = tx.Exec("CREATE INDEX IF NOT EXISTS idx_aliases_authorId ON Aliases(authorId)")
+	if err != nil {
+		return err
+	}
+
+	_, err = tx.Exec("CREATE INDEX IF NOT EXISTS idx_doctags_tagid ON DocumentTags (tagId)")
+	if err != nil {
+		return err
+	}
+
+	return tx.Commit()
+}
+
+func (q Query) Close() error {
+	return q.db.Close()
+}
+
+// Get loads every stored document and assembles an index rooted at indexRoot
+func (q Query) Get(indexRoot string) (*index.Index, error) {
+	ctx := context.TODO()
+
+	docs, err := FillMany{Db: q.db}.Get(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	idx := &index.Index{
+		Root:      indexRoot,
+		Documents: docs,
+		Filters:   index.DefaultFilters(),
+	}
+
+	return idx, nil
+}
+
+// Write from index to database
+func (q Query) Put(idx index.Index) error {
+	ctx := context.TODO()
+
+	insertCtx, cancel := context.WithCancelCause(ctx)
+	defer cancel(nil)
+
+	p, err := NewPutMany(q.db, idx.Documents)
+	if err != nil {
+		return err
+	}
+
+	if err := p.Insert(insertCtx); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// Update database with values from index
+func (q Query) Update(idx index.Index) error {
+	// TODO: implement
+	return nil
+}
+
+func (q Query) GetDocument(path string) (*index.Document, error) {
+	ctx := context.TODO()
+	f := Fill{Path: path, Db: q.db}
+	return f.Get(ctx)
+}
diff --git a/pkg/data/db_test.go b/pkg/data/db_test.go
new file mode 100644
index 0000000..b90a943
--- /dev/null
+++ b/pkg/data/db_test.go
@@ -0,0 +1,65 @@
+package data_test
+
+import (
+	"slices"
+	"testing"
+
+	"github.com/jpappel/atlas/pkg/data"
+)
+
+func TestBatchQuery(t *testing.T) {
+	tests := []struct {
+		name      string
+		query     string
+		start     string
+		val       string
+		delim     string
+		stop      string
+		n         int
+		args      []int
+		wantQuery string
+		wantArgs  []any
+	}{
+		{
+			"1 val group",
+			"INSERT INTO Foo VALUES",
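+			// the remaining positional fields: start, val, delim, stop, n,
+			// args, then the expected query and args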
+			"(",
+			"?",
+			",",
+			")",
+			5,
+			[]int{1, 2, 3, 4, 5},
+			"INSERT INTO Foo VALUES (?,?,?,?,?)",
+			[]any{1, 2, 3, 4, 5},
+		},
+		{
+			"multiple val groups",
+			"INSERT INTO Bar VALUES",
+			"",
+			"(?,?)",
+			",",
+			"",
+			2,
+			[]int{1, 2, 3, 4},
+			"INSERT INTO Bar VALUES (?,?),(?,?)",
+			[]any{1, 2, 3, 4},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			gotQuery, gotArgs := data.BatchQuery(tt.query, tt.start, tt.val, tt.delim, tt.stop, tt.n, tt.args)
+			if gotQuery != tt.wantQuery {
+				t.Error("Got different query than wanted")
+				t.Log("Wanted:\n" + tt.wantQuery)
+				t.Log("Got:\n" + gotQuery)
+			}
+
+			if !slices.Equal(tt.wantArgs, gotArgs) {
+				t.Error("Got different args than wanted")
+				t.Logf("Wanted:\t%v", tt.wantArgs)
+				t.Logf("Got:\t%v", gotArgs)
+			}
+		})
+	}
+}
diff --git a/pkg/data/get.go b/pkg/data/get.go
new file mode 100644
index 0000000..7cae03a
--- /dev/null
+++ b/pkg/data/get.go
@@ -0,0 +1,330 @@
+package data
+
+import (
+	"context"
+	"database/sql"
+	"time"
+
+	"github.com/jpappel/atlas/pkg/index"
+)
+
+// TODO: rename struct
+//
+// Use to build a document from a database connection
+type Fill struct {
+	Path string
+	Db   *sql.DB
+	id   int
+	doc  *index.Document
+}
+
+// TODO: rename struct
+//
+// Use to build documents and aliases from a database connection
+type FillMany struct {
+	docs map[string]*index.Document
+	ids  map[string]int
+	Db   *sql.DB
+}
+
+func (f Fill) Get(ctx context.Context) (*index.Document, error) {
+	f.doc = &index.Document{Path: f.Path}
+	if err := f.document(ctx); err != nil {
+		return nil, err
+	}
+	if err := f.tags(ctx); err != nil {
+		return nil, err
+	}
+	if err := f.authors(ctx); err != nil {
+		return nil, err
+	}
+	if err := f.links(ctx); err != nil {
+		return nil, err
+	}
+
+	return f.doc, nil
+}
+
+func (f FillMany) Get(ctx context.Context) (map[string]*index.Document, error) {
+	f.docs = make(map[string]*index.Document)
+	f.ids = make(map[string]int)
+
+	if err := f.documents(ctx); err != nil {
+		return nil, err
+	}
+	if err := f.tags(ctx); err != nil {
+		return nil, err
+	}
+	if err := f.links(ctx); err != nil {
+		return nil, err
+	}
+	if err := f.authors(ctx); err != nil {
+		return nil, err
+	}
+
+	return f.docs, nil
+}
+
+func (f *Fill) document(ctx context.Context) error {
+	var title sql.NullString
+	var dateEpoch sql.NullInt64
+	var fileTimeEpoch sql.NullInt64
+	var meta sql.NullString
+
+	row := f.Db.QueryRowContext(ctx, `
+		SELECT id, title, date, fileTime, meta
+		FROM Documents
+		WHERE path = ?
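+		-- path is UNIQUE, so at most one row matches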
+	`, f.Path)
+	if err := row.Scan(&f.id, &title, &dateEpoch, &fileTimeEpoch, &meta); err != nil {
+		return err
+	}
+
+	if title.Valid {
+		f.doc.Title = title.String
+	}
+	if dateEpoch.Valid {
+		f.doc.Date = time.Unix(dateEpoch.Int64, 0)
+	}
+	if fileTimeEpoch.Valid {
+		f.doc.FileTime = time.Unix(fileTimeEpoch.Int64, 0)
+	}
+	if meta.Valid {
+		f.doc.OtherMeta = meta.String
+	}
+	return nil
+}
+
+func (f *FillMany) documents(ctx context.Context) error {
+	rows, err := f.Db.QueryContext(ctx, `
+		SELECT id, path, title, date, fileTime, meta
+		FROM Documents
+	`)
+	if err != nil {
+		return err
+	}
+	defer rows.Close()
+
+	var id int
+	var docPath string
+	var title, meta sql.NullString
+	var dateEpoch, filetimeEpoch sql.NullInt64
+
+	for rows.Next() {
+		if err := rows.Scan(&id, &docPath, &title, &dateEpoch, &filetimeEpoch, &meta); err != nil {
+			return err
+		}
+
+		doc := &index.Document{
+			Path: docPath,
+		}
+
+		if title.Valid {
+			doc.Title = title.String
+		}
+		if dateEpoch.Valid {
+			doc.Date = time.Unix(dateEpoch.Int64, 0)
+		}
+		if filetimeEpoch.Valid {
+			doc.FileTime = time.Unix(filetimeEpoch.Int64, 0)
+		}
+		if meta.Valid {
+			doc.OtherMeta = meta.String
+		}
+
+		f.docs[docPath] = doc
+		f.ids[docPath] = id
+	}
+
+	return nil
+}
+
+func (f Fill) authors(ctx context.Context) error {
+	rows, err := f.Db.QueryContext(ctx, `
+		SELECT name
+		FROM Authors
+		JOIN DocumentAuthors
+		ON Authors.id = DocumentAuthors.authorId
+		WHERE docId = ?
+	`, f.id)
+	if err != nil {
+		return err
+	}
+	defer rows.Close()
+
+	var name string
+	authors := make([]string, 0, 4)
+	for rows.Next() {
+		if err := rows.Scan(&name); err != nil {
+			return err
+		}
+		authors = append(authors, name)
+	}
+
+	f.doc.Authors = authors
+
+	return nil
+}
+
+func (f FillMany) authors(ctx context.Context) error {
+	stmt, err := f.Db.PrepareContext(ctx, `
+		SELECT name
+		FROM Authors
+		JOIN DocumentAuthors
+		ON Authors.id = DocumentAuthors.authorId
+		WHERE docId = ?
+	`)
+	if err != nil {
+		return err
+	}
+	defer stmt.Close()
+
+	// PERF: parallelize
+	var name string
+	for path, id := range f.ids {
+		rows, err := stmt.QueryContext(ctx, id)
+		if err != nil {
+			return err
+		}
+
+		doc := f.docs[path]
+		for rows.Next() {
+			if err := rows.Scan(&name); err != nil {
+				rows.Close()
+				return err
+			}
+
+			doc.Authors = append(doc.Authors, name)
+		}
+
+		rows.Close()
+	}
+
+	return nil
+}
+
+func (f Fill) tags(ctx context.Context) error {
+	rows, err := f.Db.QueryContext(ctx, `
+		SELECT name
+		FROM Tags
+		JOIN DocumentTags
+		ON Tags.id = DocumentTags.tagId
+		WHERE docId = ?
+	`, f.id)
+	if err != nil {
+		return err
+	}
+	defer rows.Close()
+
+	var tag string
+	tags := make([]string, 0, 2)
+	for rows.Next() {
+		if err := rows.Scan(&tag); err != nil {
+			return err
+		}
+		tags = append(tags, tag)
+	}
+
+	f.doc.Tags = tags
+
+	return nil
+}
+
+func (f FillMany) tags(ctx context.Context) error {
+	stmt, err := f.Db.PrepareContext(ctx, `
+		SELECT name
+		FROM Tags
+		JOIN DocumentTags
+		ON Tags.id = DocumentTags.tagId
+		WHERE docId = ?
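+		-- prepared once, then run below for each document id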
+	`)
+	if err != nil {
+		return err
+	}
+	defer stmt.Close()
+
+	// PERF: parallelize
+	var tag string
+	for docPath, id := range f.ids {
+		rows, err := stmt.QueryContext(ctx, id)
+		if err != nil {
+			return err
+		}
+
+		doc := f.docs[docPath]
+		for rows.Next() {
+			if err := rows.Scan(&tag); err != nil {
+				rows.Close()
+				return err
+			}
+
+			doc.Tags = append(doc.Tags, tag)
+		}
+
+		rows.Close()
+	}
+
+	return nil
+}
+
+func (f Fill) links(ctx context.Context) error {
+	rows, err := f.Db.QueryContext(ctx, `
+		SELECT path
+		FROM Documents
+		JOIN Links
+		ON Links.referencedId = Documents.id
+		WHERE Links.refererId = ?
+	`, f.id)
+	if err != nil {
+		return err
+	}
+	defer rows.Close()
+
+	var link string
+	links := make([]string, 0)
+	for rows.Next() {
+		if err := rows.Scan(&link); err != nil {
+			return err
+		}
+		links = append(links, link)
+	}
+	f.doc.Links = links
+
+	return nil
+}
+
+func (f FillMany) links(ctx context.Context) error {
+	stmt, err := f.Db.PrepareContext(ctx, `
+		SELECT path
+		FROM Documents
+		JOIN Links
+		ON Links.referencedId = Documents.id
+		WHERE Links.refererId = ?
+	`)
+	if err != nil {
+		return err
+	}
+	defer stmt.Close()
+
+	// PERF: parallelize
+	var linkPath string
+	for path, id := range f.ids {
+		rows, err := stmt.QueryContext(ctx, id)
+		if err != nil {
+			return err
+		}
+
+		doc := f.docs[path]
+		for rows.Next() {
+			if err := rows.Scan(&linkPath); err != nil {
+				rows.Close()
+				return err
+			}
+			doc.Links = append(doc.Links, linkPath)
+		}
+
+		rows.Close()
+	}
+
+	return nil
+}
diff --git a/pkg/data/get_test.go b/pkg/data/get_test.go
new file mode 100644
index 0000000..14d6920
--- /dev/null
+++ b/pkg/data/get_test.go
@@ -0,0 +1,217 @@
+package data_test
+
+import (
+	"context"
+	"database/sql"
+	"errors"
+	"testing"
+	"time"
+
+	"github.com/jpappel/atlas/pkg/data"
+	"github.com/jpappel/atlas/pkg/index"
+)
+
+func singleDoc(t *testing.T) *sql.DB {
+	t.Helper()
+	db := data.NewMemDB()
+
+	if _, err := db.Exec(`
+	INSERT INTO Documents (path, title, date, fileTime)
+	VALUES ("/file", "A file", 1, 2)
+	`); err != nil {
+		t.Fatal("err inserting doc:", err)
+	}
+
+	if _, err := db.Exec(`
+	INSERT INTO Authors (name)
+	VALUES ("jp")
+	`); err != nil {
+		t.Fatal("err inserting author:", err)
+	}
+
+	if _, err := db.Exec(`
+	INSERT INTO Aliases (authorId, alias)
+	VALUES (1,"pj"), (1,"JP")
+	`); err != nil {
+		t.Fatal("err inserting aliases:", err)
+	}
+
+	if _, err := db.Exec(`
+	INSERT INTO Tags (name)
+	VALUES ("foo"), ("bar"), ("baz"), ("oof")
+	`); err != nil {
+		t.Fatal("err inserting tags:", err)
+	}
+
+	if _, err := db.Exec(`
+	INSERT INTO DocumentAuthors (docId, authorId)
+	VALUES (1, 1)
+	`); err != nil {
+		t.Fatal("err inserting docAuthors:", err)
+	}
+
+	if _, err := db.Exec(`
+	INSERT INTO DocumentTags (docId, tagId)
+	VALUES (1,1), (1,2), (1,3), (1,4)
+	`); err != nil {
+		t.Fatal("err inserting docTags:", err)
+	}
+
+	return db
+}
+
+func multiDoc(t *testing.T) *sql.DB {
+	t.Helper()
+	db := data.NewMemDB()
+
+	if _, err := db.Exec(`
+	INSERT INTO Documents (path, title, date, fileTime)
+	VALUES ("/notes/anote.md", "A note", 1, 2), ("README.md", "read this file!", 3, 4)
+	`); err != nil {
+		t.Fatal("err inserting doc:", err)
+	}
+
+	if _, err := db.Exec(`
+	INSERT INTO Authors (name)
+	VALUES ("jp"), ("anonymous")
+	`); err != nil {
+		t.Fatal("err inserting author:", err)
+	}
+
+	if _, err := db.Exec(`
+	INSERT INTO Aliases (authorId, alias)
+	VALUES (1,"pj"), (1,"JP")
+	`); err != nil {
+		t.Fatal("err inserting aliases:", err)
+	}
+
+	if _, err := db.Exec(`
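+	-- tags receive rowids 1-4 in insertion order; DocumentTags below relies on them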
+	INSERT INTO Tags (name)
+	VALUES ("foo"), ("bar"), ("baz"), ("oof")
+	`); err != nil {
+		t.Fatal("err inserting tags:", err)
+	}
+
+	if _, err := db.Exec(`
+	INSERT INTO DocumentAuthors (docId, authorId)
+	VALUES (1, 1), (2, 2), (2, 1)
+	`); err != nil {
+		t.Fatal("err inserting docAuthors:", err)
+	}
+
+	if _, err := db.Exec(`
+	INSERT INTO DocumentTags (docId, tagId)
+	VALUES (1,1), (2,2), (1,3), (2,4)
+	`); err != nil {
+		t.Fatal("err inserting docTags:", err)
+	}
+
+	return db
+}
+
+func TestFill_Get(t *testing.T) {
+	tests := []struct {
+		name    string
+		newFill func(t *testing.T) data.Fill
+		want    index.Document
+		wantErr error
+	}{
+		{
+			"single doc",
+			func(t *testing.T) data.Fill {
+				t.Helper()
+				return data.Fill{Path: "/file", Db: singleDoc(t)}
+			},
+			index.Document{
+				Path:     "/file",
+				Title:    "A file",
+				Date:     time.Unix(1, 0),
+				FileTime: time.Unix(2, 0),
+				Authors:  []string{"jp"},
+				Tags:     []string{"foo", "bar", "oof", "baz"},
+			},
+			nil,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			f := tt.newFill(t)
+			got, gotErr := f.Get(context.Background())
+
+			if !errors.Is(gotErr, tt.wantErr) {
+				t.Fatalf("Received unexpected error: got %v want %v", gotErr, tt.wantErr)
+			} else if gotErr != nil {
+				return
+			}
+
+			if !got.Equal(tt.want) {
+				t.Errorf("Get() = %+v\nWant %+v", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestFillMany_Get(t *testing.T) {
+	tests := []struct {
+		name        string
+		newFillMany func(t *testing.T) data.FillMany
+		want        map[string]*index.Document
+		wantErr     error
+	}{
+		{
+			"multi doc",
+			func(t *testing.T) data.FillMany {
+				t.Helper()
+				return data.FillMany{Db: multiDoc(t)}
+			},
+			map[string]*index.Document{
+				"/notes/anote.md": {
+					Path:     "/notes/anote.md",
+					Title:    "A note",
+					Date:     time.Unix(1, 0),
+					FileTime: time.Unix(2, 0),
+					Authors:  []string{"jp"},
+					Tags:     []string{"foo", "baz"},
+				},
+				"README.md": {
+					Path:     "README.md",
+					Title:    "read this file!",
+					Date:     time.Unix(3, 0),
+					FileTime: time.Unix(4, 0),
+					Authors:  []string{"anonymous", "jp"},
+					Tags:     []string{"bar", "oof"},
+				},
+			},
+			nil,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ctx := context.Background()
+			f := tt.newFillMany(t)
+
+			got, gotErr := f.Get(ctx)
+			if !errors.Is(gotErr, tt.wantErr) {
+				t.Fatalf("Received unexpected error: got %v want %v", gotErr, tt.wantErr)
+			} else if gotErr != nil {
+				return
+			}
+
+			if len(tt.want) != len(got) {
+				t.Errorf("Received incorrect number of documents: got %d want %d", len(got), len(tt.want))
+			}
+
+			for path, wantDoc := range tt.want {
+				gotDoc, ok := got[path]
+				if !ok {
+					t.Errorf("Can't find %s in received docs", path)
+					continue
+				}
+
+				if !gotDoc.Equal(*wantDoc) {
+					t.Errorf("%s not equal %+v\nWant %+v", path, gotDoc, wantDoc)
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/data/put.go b/pkg/data/put.go
new file mode 100644
index 0000000..f3fe4b6
--- /dev/null
+++ b/pkg/data/put.go
@@ -0,0 +1,384 @@
+package data
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+
+	"github.com/jpappel/atlas/pkg/index"
+)
+
+// TODO: rename struct
+type Put struct {
+	Id  int64
+	Doc index.Document
+	tx  *sql.Tx
+}
+
+// TODO: rename struct
+type PutMany struct {
+	Docs     map[int64]*index.Document
+	pathDocs map[string]*index.Document
+	db       *sql.DB
+}
+
+func NewPut(ctx context.Context, db *sql.DB, doc index.Document) (Put, error) {
+	tx, err := db.BeginTx(ctx, nil)
+	if err != nil {
+		return Put{}, err
+	}
+	p := Put{Doc: doc, tx: tx}
+	return p, nil
+}
+
+func NewPutMany(db *sql.DB, documents map[string]*index.Document) (PutMany, error) {
+	docs := make(map[int64]*index.Document, len(documents))
+	p := PutMany{
+		Docs:     docs,
+		pathDocs: documents,
+		db:       db,
+	}
+	return p, nil
+}
+
+func (p Put) Insert() error {
+	if err := p.document(); err != nil {
+		p.tx.Rollback()
+		return err
+	}
+
+	if err := p.tags(); err != nil {
+		p.tx.Rollback()
+		return err
+	}
+
+	if err := p.links(); err != nil {
+		p.tx.Rollback()
+		return err
+	}
+
+	if err := p.authors(); err != nil {
+		p.tx.Rollback()
+		return err
+	}
+
+	return p.tx.Commit()
+}
+
+func (p PutMany) Insert(ctx context.Context) error {
+	if err := p.documents(ctx); err != nil {
+		return err
+	}
+
+	if err := p.tags(ctx); err != nil {
+		return err
+	}
+
+	if err := p.links(ctx); err != nil {
+		return err
+	}
+
+	if err := p.authors(ctx); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (p *Put) document() error {
+	title := sql.NullString{String: p.Doc.Title, Valid: p.Doc.Title != ""}
+
+	// a zero time means "unset" and is stored as NULL, not as an epoch value
+	date := sql.NullInt64{Int64: p.Doc.Date.Unix(), Valid: !p.Doc.Date.IsZero()}
+	filetime := sql.NullInt64{Int64: p.Doc.FileTime.Unix(), Valid: !p.Doc.FileTime.IsZero()}
+
+	meta := sql.NullString{String: p.Doc.OtherMeta, Valid: p.Doc.OtherMeta != ""}
+
+	result, err := p.tx.Exec(`
+		INSERT INTO Documents(path, title, date, fileTime, meta)
+		VALUES (?,?,?,?,?)
+	`, p.Doc.Path, title, date, filetime, meta)
+	if err != nil {
+		return err
+	}
+
+	p.Id, err = result.LastInsertId()
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (p *PutMany) documents(ctx context.Context) error {
+	stmt, err := p.db.PrepareContext(ctx, `
+		INSERT INTO Documents(path, title, date, fileTime, meta)
+		VALUES (?,?,?,?,?)
+	`)
+	if err != nil {
+		return err
+	}
+	defer stmt.Close()
+
+	tx, err := p.db.BeginTx(ctx, nil)
+	if err != nil {
+		return err
+	}
+
+	txStmt := tx.StmtContext(ctx, stmt)
+
+	// PERF: profile this, grabbing the docId here might save time by simplifying
+	// future inserts
+	for _, doc := range p.pathDocs {
+		title := sql.NullString{String: doc.Title, Valid: doc.Title != ""}
+		date := sql.NullInt64{Int64: doc.Date.Unix(), Valid: !doc.Date.IsZero()}
+		filetime := sql.NullInt64{Int64: doc.FileTime.Unix(), Valid: !doc.FileTime.IsZero()}
+		meta := sql.NullString{String: doc.OtherMeta, Valid: doc.OtherMeta != ""}
+
+		res, err := txStmt.ExecContext(ctx, doc.Path, title, date, filetime, meta)
+		if err != nil {
+			tx.Rollback()
+			return err
+		}
+
+		id, err := res.LastInsertId()
+		if err != nil {
+			tx.Rollback()
+			return err
+		}
+
+		p.Docs[id] = doc
+	}
+
+	return tx.Commit()
+}
+
+func (p Put) tags() error {
+	if len(p.Doc.Tags) == 0 {
+		return nil
+	}
+
+	query, args := BatchQuery("INSERT OR IGNORE INTO Tags (name) VALUES", "", "(?)", ",", "", len(p.Doc.Tags), p.Doc.Tags)
+	if _, err := p.tx.Exec(query, args...); err != nil {
+		return err
+	}
+
+	preQuery := fmt.Sprintf(`
+	INSERT INTO DocumentTags
+	SELECT %d, Tags.id
+	FROM Tags
+	WHERE name IN
+	`, p.Id)
+
+	query, args = BatchQuery(preQuery, "(", "?", ",", ")", len(p.Doc.Tags), p.Doc.Tags)
+	if _, err := p.tx.Exec(query, args...); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (p PutMany) tags(ctx context.Context) error {
+	newTagStmt, err := p.db.PrepareContext(ctx, "INSERT OR IGNORE INTO Tags (name) VALUES (?)")
+	if err != nil {
+		return err
+	}
+	defer newTagStmt.Close()
+
+	tx, err := p.db.BeginTx(ctx, nil)
+	if err != nil {
+		return err
+	}
+
+	txNewTagStmt := tx.StmtContext(ctx, newTagStmt)
+
+	for id, doc := range p.Docs {
+		for _, tag := range doc.Tags {
+			if _, err := txNewTagStmt.ExecContext(ctx, tag); err != nil {
+				tx.Rollback()
+				return err
+			}
+		}
+
+		// documents without tags have nothing to link
+		if len(doc.Tags) == 0 {
+			continue
+		}
+
+		preQuery := fmt.Sprintf(`
+		INSERT INTO DocumentTags (docId, tagId)
+		SELECT %d, Tags.id
+		FROM Tags
+		WHERE name IN
+		`, id)
+		query, args := BatchQuery(preQuery, "(", "?", ",", ")", len(doc.Tags), doc.Tags)
+		if _, err := tx.Exec(query, args...); err != nil {
+			tx.Rollback()
+			return err
+		}
+	}
+
+	return tx.Commit()
+}
+
+func (p Put) links() error {
+	if len(p.Doc.Links) == 0 {
+		return nil
+	}
+
+	preQuery := fmt.Sprintf(`
+	INSERT INTO Links (referencedId, refererId)
+	SELECT id, %d
+	FROM Documents
+	WHERE path IN
+	`, p.Id)
+	query, args := BatchQuery(preQuery, "(", "?", ",", ")", len(p.Doc.Links), p.Doc.Links)
+	if _, err := p.tx.Exec(query, args...); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (p PutMany) links(ctx context.Context) error {
+	tx, err := p.db.BeginTx(ctx, nil)
+	if err != nil {
+		return err
+	}
+
+	for id, doc := range p.Docs {
+		if len(doc.Links) == 0 {
+			continue
+		}
+
+		preQuery := fmt.Sprintf(`
+		INSERT INTO Links (referencedId, refererId)
+		SELECT id, %d
+		FROM Documents
+		WHERE path IN
+		`, id)
+		query, args := BatchQuery(preQuery, "(", "?", ",", ")", len(doc.Links), doc.Links)
+		if _, err := tx.Exec(query, args...); err != nil {
+			tx.Rollback()
+			return err
+		}
+	}
+
+	return tx.Commit()
+}
+
+func (p Put) authors() error {
+	if len(p.Doc.Authors) == 0 {
+		return nil
+	}
+
+	// TODO: consider using temp table instead of cte
+	namesCTE, args := BatchQuery("WITH names(n) AS",
+		"( VALUES ", "(?)", ",", "),", len(p.Doc.Authors), p.Doc.Authors)
+
+	newAuthorsQuery := namesCTE + `
+	filtered_names AS (
+		SELECT n
+		FROM names
+		LEFT JOIN Authors on Authors.name = n
+		LEFT JOIN Aliases on Aliases.alias = n
+		WHERE Authors.name IS NULL AND Aliases.alias IS NULL
+	)
+	INSERT INTO Authors(name)
+	SELECT n FROM filtered_names
+	`
+	if _, err := p.tx.Exec(newAuthorsQuery, args...); err != nil {
+		return err
+	}
+
+	docAuthorsQuery := namesCTE + fmt.Sprintf(`
+	matched_authors AS (
+		SELECT Authors.id AS author_id
+		FROM Authors
+		LEFT JOIN Aliases
+		ON Authors.id = Aliases.authorId
+		JOIN names
+		ON Authors.name = n OR Aliases.alias = n
+	)
+	INSERT INTO DocumentAuthors(docId, authorId)
+	SELECT %d, author_id FROM matched_authors
+	`, p.Id)
+	if _, err := p.tx.Exec(docAuthorsQuery, args...); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (p PutMany) authors(ctx context.Context) error {
+	tx, err := p.db.BeginTx(ctx, nil)
+	if err != nil {
+		return err
+	}
+
+	// create the scratch table on the transaction's connection so the
+	// statements below can see it (temp tables are per connection)
+	_, err = tx.Exec("CREATE TEMPORARY TABLE names (name TEXT UNIQUE NOT NULL)")
+	if err != nil {
+		tx.Rollback()
+		return err
+	}
+
+	nameStmt, err := tx.PrepareContext(ctx, "INSERT OR IGNORE INTO temp.names VALUES (?)")
+	if err != nil {
+		tx.Rollback()
+		return err
+	}
+	defer nameStmt.Close()
+
+	for _, doc := range p.Docs {
+		for _, name := range doc.Authors {
+			if _, err := nameStmt.Exec(name); err != nil {
+				tx.Rollback()
+				return err
+			}
+		}
+	}
+
+	newAuthorsQuery := `
+	WITH new_names AS (
+		SELECT temp.names.name AS name
+		FROM temp.names
+		LEFT JOIN Authors on Authors.name = temp.names.name
+		LEFT JOIN Aliases on Aliases.alias = temp.names.name
+		WHERE Authors.name IS NULL AND Aliases.alias IS NULL
+	)
+	INSERT INTO Authors(name)
+	SELECT name FROM new_names
+	`
+
+	if _, err := tx.Exec(newAuthorsQuery); err != nil {
+		tx.Rollback()
+		return err
+	}
+
+	_, err = tx.Exec(`
+	CREATE TEMPORARY TABLE name_ids AS
+	SELECT names.name AS name,
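+		-- a direct Authors match wins; otherwise fall back to the alias owner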
+		COALESCE(Authors.id, Aliases.authorId) AS authorId
+	FROM temp.names
+	LEFT JOIN Authors ON names.name = Authors.name
+	LEFT JOIN Aliases ON names.name = Aliases.alias
+	`)
+	if err != nil {
+		tx.Rollback()
+		return err
+	}
+
+	docAuthorsStmt, err := tx.PrepareContext(ctx, `
+		INSERT INTO DocumentAuthors (docId, authorId)
+		SELECT ?, authorId
+		FROM temp.name_ids
+		WHERE name = ?
+	`)
+	if err != nil {
+		tx.Rollback()
+		return err
+	}
+	defer docAuthorsStmt.Close()
+
+	for id, doc := range p.Docs {
+		for _, name := range doc.Authors {
+			if _, err := docAuthorsStmt.Exec(id, name); err != nil {
+				tx.Rollback()
+				return err
+			}
+		}
+	}
+
+	// drop the scratch tables before the connection returns to the pool
+	if _, err := tx.Exec("DROP TABLE temp.name_ids"); err != nil {
+		tx.Rollback()
+		return err
+	}
+	if _, err := tx.Exec("DROP TABLE temp.names"); err != nil {
+		tx.Rollback()
+		return err
+	}
+
+	return tx.Commit()
+}
diff --git a/pkg/data/put_test.go b/pkg/data/put_test.go
new file mode 100644
index 0000000..7e5ad38
--- /dev/null
+++ b/pkg/data/put_test.go
@@ -0,0 +1,98 @@
+package data_test
+
+import (
+	"context"
+	"database/sql"
+	"errors"
+	"testing"
+	"time"
+
+	"github.com/jpappel/atlas/pkg/data"
+	"github.com/jpappel/atlas/pkg/index"
+)
+
+func TestPut_Insert(t *testing.T) {
+	tests := []struct {
+		name    string
+		newDb   func(t *testing.T) *sql.DB
+		doc     index.Document
+		wantErr error
+	}{
+		{
+			"insert on empty",
+			func(t *testing.T) *sql.DB {
+				t.Helper()
+				return data.NewMemDB()
+			},
+			index.Document{
+				Path:     "/file",
+				Title:    "A file",
+				Date:     time.Unix(1, 0),
+				FileTime: time.Unix(2, 0),
+				Authors:  []string{"jp"},
+				Tags:     []string{"foo", "bar", "oof", "baz"},
+			},
+			nil,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ctx := context.Background()
+			db := tt.newDb(t)
+			defer db.Close()
+
+			p, err := data.NewPut(ctx, db, tt.doc)
+			if err != nil {
+				t.Fatalf("could not construct receiver type: %v", err)
+			}
+
+			gotErr := p.Insert()
+			if !errors.Is(gotErr, tt.wantErr) {
+				t.Fatalf("Unexpected error on Insert(): want %v, got %v", tt.wantErr, gotErr)
+			} else if gotErr != nil {
+				return
+			}
+
+			f := data.Fill{Path: tt.doc.Path, Db: db}
+			gotDoc, err := f.Get(ctx)
+			if err != nil {
+				t.Fatal("Error while retrieving document for comparison:", err)
+			}
+
+			if !gotDoc.Equal(tt.doc) {
+				t.Errorf("Retrieved doc is not stored doc!\nrecv: %+v\nsent: %+v", gotDoc, tt.doc)
+			}
+		})
+	}
+}
+
+func TestPutMany_Insert(t *testing.T) {
+	tests := []struct {
+		name string // description of this test case
+		// Named input parameters for receiver constructor.
+		db        *sql.DB
+		documents map[string]*index.Document
+		wantErr   bool
+	}{
+		// TODO: Add test cases.
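+		// An illustrative starter case (assumed data): two documents that
+		// tag and link each other, written to a fresh in-memory database.
+		{
+			name: "two linked docs on empty",
+			db:   data.NewMemDB(),
+			documents: map[string]*index.Document{
+				"/a.md": {
+					Path:    "/a.md",
+					Title:   "A",
+					Authors: []string{"jp"},
+					Tags:    []string{"foo"},
+					Links:   []string{"/b.md"},
+				},
+				"/b.md": {
+					Path:    "/b.md",
+					Title:   "B",
+					Authors: []string{"jp"},
+					Tags:    []string{"bar"},
+					Links:   []string{"/a.md"},
+				},
+			},
+			wantErr: false,
+		},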
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			p, err := data.NewPutMany(tt.db, tt.documents)
+			if err != nil {
+				t.Fatalf("could not construct receiver type: %v", err)
+			}
+			gotErr := p.Insert(context.Background())
+			if gotErr != nil {
+				if !tt.wantErr {
+					t.Errorf("Insert() failed: %v", gotErr)
+				}
+				return
+			}
+			if tt.wantErr {
+				t.Fatal("Insert() succeeded unexpectedly")
+			}
+		})
+	}
+}
diff --git a/pkg/index/filters.go b/pkg/index/filters.go
new file mode 100644
index 0000000..f59a5d6
--- /dev/null
+++ b/pkg/index/filters.go
@@ -0,0 +1,84 @@
+package index
+
+import (
+	"io"
+	"path/filepath"
+)
+
+// TODO: create excluded path filter factory
+
+type DocFilter func(infoPath, io.ReadSeeker) bool
+
+func NewExtensionFilter(ext string) DocFilter {
+	return func(ip infoPath, _ io.ReadSeeker) bool {
+		return filepath.Ext(ip.path) == ext
+	}
+}
+
+func NewMaxFilesizeFilter(size int64) DocFilter {
+	return func(ip infoPath, _ io.ReadSeeker) bool {
+		return ip.info.Size() <= size
+	}
+}
+
+// YamlHeaderFilter reports whether r starts with a yaml front matter block:
+// an opening "---\n" eventually closed by a "\n---\n".
+func YamlHeaderFilter(_ infoPath, r io.ReadSeeker) bool {
+	const bufSize = 4096
+	buf := make([]byte, bufSize)
+
+	carry := make([]byte, 4)
+	cmp := make([]byte, 5)
+	if _, err := io.ReadFull(r, carry); err != nil || string(carry) != "---\n" {
+		return false
+	}
+
+	headerFound := false
+	readMore := true
+	for readMore {
+		buf = buf[:bufSize]
+		n, err := r.Read(buf)
+		if err == io.EOF {
+			readMore = false
+		} else if err != nil {
+			return false
+		}
+		buf = buf[:n]
+
+		// PERF: the carry doesn't need to be checked on the first loop iteration
+		// look for a terminator spanning the previous chunk boundary
+		for i := range min(4, n) {
+			if carry[i] != '\n' {
+				continue
+			}
+			for j := range 5 {
+				if i+j < 4 {
+					cmp[j] = carry[i+j]
+				} else {
+					cmp[j] = buf[i+j-4]
+				}
+			}
+			if string(cmp) == "\n---\n" {
+				headerFound = true
+				readMore = false
+				break
+			}
+		}
+		for i := range n - 4 {
+			if buf[i] == '\n' && string(buf[i:i+5]) == "\n---\n" {
+				headerFound = true
+				readMore = false
+				break
+			}
+		}
+
+		if readMore && n >= 4 {
+			copy(carry, buf[n-4:])
+		}
+	}
+
+	return headerFound
+}
+
+func DefaultFilters() []DocFilter {
+	return []DocFilter{NewExtensionFilter(".md"), NewMaxFilesizeFilter(200 * 1024)}
+}
diff --git a/pkg/index/filters_test.go b/pkg/index/filters_test.go
new file mode 100644
index 0000000..897a82f
--- /dev/null
+++ b/pkg/index/filters_test.go
@@ -0,0 +1,112 @@
+package index
+
+import (
+	"bytes"
+	"io"
+	"os"
+	"testing"
+)
+
+func noYamlHeader() io.ReadSeeker {
+	buf := []byte("just some text")
+	return bytes.NewReader(buf)
+}
+
+func incompleteYamlHeader() io.ReadSeeker {
+	buf := []byte("---\nfoo:bar\ntitle:bizbaz\nauthor:\n-JP Appel\n---")
+	return bytes.NewReader(buf)
+}
+
+func completeYamlHeader() io.ReadSeeker {
+	buf := []byte("---\nfoo:bar\ntitle:bizbaz\nauthor:\n-JP Appel\n---\n")
+	return bytes.NewReader(buf)
+}
+
+func trailingYamlHeader() io.ReadSeeker {
+	buf := []byte("---\nfoo:bar\ntitle:bizbaz\nauthor:\n-JP Appel\n---\nhere is some content\nanother line of text")
+	return bytes.NewReader(buf)
+}
+
+func extensionless(t *testing.T) infoPath {
+	root := t.TempDir()
+	path := root + "/" + "afile"
+	f, err := os.Create(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	if _, err := f.WriteString("this is a file"); err != nil {
+		t.Fatal(err)
+	}
+
+	info, err := f.Stat()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	return infoPath{path, info}
+}
+
+func markdownExtension(t *testing.T) infoPath {
+	root := t.TempDir()
+	path := root + "/" + "a.md"
+	f, err := os.Create(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	info, err := f.Stat()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	return infoPath{path, info}
+}
+
+func TestYamlHeaderFilter(t *testing.T) {
+	tests := []struct {
+		name string
+		r    io.ReadSeeker
+		want bool
+	}{
+		{"completeYamlHeader", completeYamlHeader(), true},
+		{"trailingYamlHeader", trailingYamlHeader(), true},
+		{"noYamlHeader", noYamlHeader(), false},
+		{"incompleteYamlHeader", incompleteYamlHeader(), false},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := YamlHeaderFilter(infoPath{}, tt.r)
+			if got != tt.want {
+				t.Errorf("YamlHeaderFilter() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestExtensionFilter(t *testing.T) {
+	tests := []struct {
+		name    string
+		infoGen func(*testing.T) infoPath
+		ext     string
+		want    bool
+	}{
+		{"no extension, accept .md", extensionless, ".md", false},
+		{"no extension, accept all", extensionless, "", true},
+		{"markdown, accept .md", markdownExtension, ".md", true},
+		{"markdown, accept .png", markdownExtension, ".png", false},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			filter := NewExtensionFilter(tt.ext)
+			ip := tt.infoGen(t)
+			got := filter(ip, nil)
+
+			if got != tt.want {
+				t.Errorf("ExtensionFilter() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
diff --git a/pkg/index/index.go b/pkg/index/index.go
new file mode 100644
index 0000000..074b659
--- /dev/null
+++ b/pkg/index/index.go
@@ -0,0 +1,275 @@
+package index
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"slices"
+	"sync"
+	"time"
+
+	"github.com/goccy/go-yaml"
+)
+
+type Document struct {
+	Path      string
+	Title     string    `yaml:"title"`
+	Date      time.Time `yaml:"date"`
+	FileTime  time.Time
+	Authors   []string `yaml:"authors"`
+	Tags      []string `yaml:"tags"`
+	Links     []string
+	OtherMeta string // unsure about how to handle this
+}
+
+type infoPath struct {
+	path string
+	info os.FileInfo
+}
+
+type Index struct {
+	Root      string // root directory for searching
+	Documents map[string]*Document
+	Filters   []DocFilter
+}
+
+func (idx Index) String() string {
+	// TODO: print info about active filters
+	return fmt.Sprintf("%s Documents[%d] Filters[%d]", idx.Root, len(idx.Documents), len(idx.Filters))
+}
+
+func (doc Document) Equal(other Document) bool {
+	if len(doc.Authors) != len(other.Authors) ||
+		len(doc.Tags) != len(other.Tags) ||
+		len(doc.Links) != len(other.Links) ||
+		doc.Path != other.Path ||
+		doc.Title != other.Title ||
+		doc.OtherMeta != other.OtherMeta ||
+		!doc.Date.Equal(other.Date) ||
+		!doc.FileTime.Equal(other.FileTime) {
+		return false
+	}
+
+	if !slices.Equal(doc.Authors, other.Authors) {
+		return false
+	}
+
+	// tags and links are compared orderless; note this sorts them in place
+	slices.Sort(doc.Tags)
+	slices.Sort(other.Tags)
+	if !slices.Equal(doc.Tags, other.Tags) {
+		return false
+	}
+
+	slices.Sort(doc.Links)
+	slices.Sort(other.Links)
+	if !slices.Equal(doc.Links, other.Links) {
+		return false
+	}
+
+	return true
+}
+
+func visit(file infoPath, visitQueue chan<- infoPath, filterQueue chan<- infoPath, wg *sync.WaitGroup) {
+	// TODO: check if symlink, and handle appropriately
+	// TODO: extract error out of function
+
+	if file.info.IsDir() {
+		entries, err := os.ReadDir(file.path)
+		if err != nil {
+			panic(err)
+		}
+
+		wg.Add(len(entries))
+		for _, entry := range entries {
+			entryInfo, err := entry.Info()
+			if err != nil {
+				panic(err)
+			}
+			// PERF: prevents deadlock but introduces an additional goroutine overhead per file
+			go func(path string) {
+				visitQueue <- infoPath{path: path, info: entryInfo}
+			}(file.path + "/" + entry.Name())
+		}
+	} else if file.info.Mode().IsRegular() {
+		filterQueue <- file
+	}
+
+	wg.Done()
+}
+
+func workerTraverse(wg *sync.WaitGroup, visitQueue chan infoPath, filterQueue chan<- infoPath) {
+	for work := range visitQueue {
+		visit(work, visitQueue, filterQueue, wg)
+	}
+}
+
+func (idx Index) Traverse(numWorkers uint) []string {
+	if numWorkers <= 1 {
+		panic(fmt.Sprint("Invalid number of workers: ", numWorkers))
+	}
+	docs := make([]string, 0)
+
+	rootInfo, err := os.Stat(idx.Root)
+	if err != nil {
+		panic(err)
+	}
+
+	jobs := make(chan infoPath, numWorkers)
+	filterQueue := make(chan infoPath, numWorkers)
+
+	activeJobs := &sync.WaitGroup{}
+
+	// start workers
+	for range numWorkers {
+		go workerTraverse(activeJobs, jobs, filterQueue)
+	}
+
+	// init send
+	activeJobs.Add(1)
+	jobs <- infoPath{path: idx.Root, info: rootInfo}
+
+	// close the queues once every file has been visited
+	go func() {
+		activeJobs.Wait()
+		close(jobs)
+		close(filterQueue)
+	}()
+
+	// gather
+	for doc := range filterQueue {
+		docs = append(docs, doc.path)
+	}
+
+	return docs
+}
+
+func (idx Index) FilterOne(path string) bool {
+	info, err := os.Stat(path)
+	if err != nil {
+		return false
+	}
+
+	f, err := os.Open(path)
+	if err != nil {
+		return false
+	}
+	defer f.Close()
+
+	for _, filter := range idx.Filters {
+		if !filter(infoPath{path, info}, f) {
+			return false
+		}
+		if _, err := f.Seek(0, io.SeekStart); err != nil {
+			return false
+		}
+	}
+	return true
+}
+
+func (idx Index) Filter(paths []string, numWorkers uint) []string {
+	fPaths := make([]string, 0, len(paths))
+	jobs := make(chan string, numWorkers)
+	accepted := make(chan string, numWorkers)
+	wg := &sync.WaitGroup{}
+
+	wg.Add(int(numWorkers))
+	for range numWorkers {
+		go func(jobs <-chan string, accepted chan<- string, wg *sync.WaitGroup) {
+			for path := range jobs {
+				if idx.FilterOne(path) {
+					accepted <- path
+				}
+			}
+			wg.Done()
+		}(jobs, accepted, wg)
+	}
+
+	go func(jobs chan<- string) {
+		for _, path := range paths {
+			jobs <- path
+		}
+		close(jobs)
+	}(jobs)
+
+	go func() {
+		wg.Wait()
+		close(accepted)
+	}()
+
+	for path := range accepted {
+		fPaths = append(fPaths, path)
+	}
+
+	return fPaths
+}
+
+func (idx Index) ParseOne(path string) (*Document, error) {
+	doc := &Document{}
+	doc.Path = path
+
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	info, err := f.Stat()
+	if err != nil {
+		return nil, err
+	}
+	doc.FileTime = info.ModTime()
+
+	// skip the opening "---\n" of the yaml header
+	buf := make([]byte, 4, 1024)
+	n, err := f.Read(buf)
+	if err != nil {
+		return nil, err
+	} else if n != 4 {
+		return nil, errors.New("short read")
+	}
+
+	// TODO: implement custom unmarshaller, for singular `Author`
+	dec := yaml.NewDecoder(f)
+	// TODO: handle no yaml header error
+	if err := dec.Decode(doc); err != nil {
+		return nil, err
+	}
+
+	// TODO: body parsing
+
+	return doc, nil
+}
+
+func (idx Index) Parse(paths []string, numWorkers uint) {
+	jobs := make(chan string, numWorkers)
+	results := make(chan Document, numWorkers)
+	wg := &sync.WaitGroup{}
+
+	wg.Add(int(numWorkers))
+	for range numWorkers {
+		go func(jobs <-chan string, results chan<- Document, wg *sync.WaitGroup) {
+			for path := range jobs {
+				doc, err := idx.ParseOne(path)
+				if err != nil {
+					// TODO: propagate error
+					panic(err)
+				}
+
+				results <- *doc
+			}
+			wg.Done()
+		}(jobs, results, wg)
+	}
+
+	go func(jobs chan<- string, paths []string) {
+		for _, path := range paths {
+			jobs <- path
+		}
+		close(jobs)
+	}(jobs, paths)
+
+	go func(results chan Document, wg *sync.WaitGroup) {
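+		// close results once every parser goroutine has drained its jobs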
+		wg.Wait()
+		close(results)
+	}(results, wg)
+
+	for doc := range results {
+		idx.Documents[doc.Path] = &doc
+	}
+}
diff --git a/pkg/index/index_test.go b/pkg/index/index_test.go
new file mode 100644
index 0000000..0b5d2f2
--- /dev/null
+++ b/pkg/index/index_test.go
@@ -0,0 +1,138 @@
+package index
+
+import (
+	"fmt"
+	"os"
+	"slices"
+	"testing"
+)
+
+var indexCases map[string]func(t *testing.T) Index
+
+func init() {
+	indexCases = make(map[string]func(t *testing.T) Index)
+
+	indexCases["single file"] = func(t *testing.T) Index {
+		root := t.TempDir()
+		index := Index{Root: root, Filters: []DocFilter{NewExtensionFilter(".md")}}
+
+		f, err := os.Create(root + "/a_file.md")
+		if err != nil {
+			t.Fatal(err)
+		}
+		f.WriteString("some file contents\n")
+
+		return index
+	}
+
+	indexCases["large file"] = func(t *testing.T) Index {
+		root := t.TempDir()
+		index := Index{Root: root}
+
+		return index
+	}
+
+	indexCases["worker saturation"] = func(t *testing.T) Index {
+		root := t.TempDir()
+		index := Index{Root: root}
+
+		permission := os.FileMode(0o777)
+		for _, dirName := range []string{"a", "b", "c", "d", "e", "f"} {
+			dir := root + "/" + dirName
+			if err := os.Mkdir(dir, permission); err != nil {
+				t.Fatal(err)
+			}
+			for i := range 8 {
+				fName := fmt.Sprint(dirName, i)
+				f, err := os.Create(dir + "/" + fName)
+				if err != nil {
+					t.Fatal(err)
+				}
+				f.WriteString(fName)
+			}
+		}
+
+		return index
+	}
+}
+
+func TestIndex_Traverse(t *testing.T) {
+	tests := []struct {
+		name       string
+		indexCase  func(t *testing.T) Index
+		numWorkers uint
+		want       []string
+	}{
+		{name: "single file", indexCase: indexCases["single file"], numWorkers: 2, want: []string{"a_file.md"}},
+		{name: "saturation test", indexCase: indexCases["worker saturation"], numWorkers: 2, want: []string{
+			"a/a0", "a/a1", "a/a2", "a/a3", "a/a4", "a/a5", "a/a6", "a/a7",
+			"b/b0", "b/b1", "b/b2", "b/b3", "b/b4", "b/b5", "b/b6", "b/b7",
+			"c/c0", "c/c1", "c/c2", "c/c3", "c/c4", "c/c5", "c/c6", "c/c7",
+			"d/d0", "d/d1", "d/d2", "d/d3", "d/d4", "d/d5", "d/d6", "d/d7",
+			"e/e0", "e/e1", "e/e2", "e/e3", "e/e4", "e/e5", "e/e6", "e/e7",
+			"f/f0", "f/f1", "f/f2", "f/f3", "f/f4", "f/f5", "f/f6", "f/f7",
+		}},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			idx := tt.indexCase(t)
+			got := idx.Traverse(tt.numWorkers)
+
+			slices.Sort(got)
+			slices.Sort(tt.want)
+
+			n := min(len(got), len(tt.want))
+			if len(got) != len(tt.want) {
+				t.Errorf("Wanted %v got %v paths", len(tt.want), len(got))
+				t.Logf("Checking up to %d values", n)
+			}
+
+			for i := range n {
+				gotPath := got[i]
+				wantPath := idx.Root + "/" + tt.want[i]
+				if gotPath != wantPath {
+					t.Errorf("At %d wanted %v, got %v", i, wantPath, gotPath)
+				}
+			}
+		})
+	}
+}
+
+func TestIndex_Filter(t *testing.T) {
+	tests := []struct {
+		name       string
+		paths      []string
+		indexCase  func(t *testing.T) Index
+		numWorkers uint
+		want       []string
+	}{
+		{"single file", []string{"a_file.md"}, indexCases["single file"], 2, []string{"a_file.md"}},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			idx := tt.indexCase(t)
+			for i, path := range tt.paths {
+				tt.paths[i] = idx.Root + "/" + path
+			}
+
+			got := idx.Filter(tt.paths, tt.numWorkers)
+
+			slices.Sort(got)
+			slices.Sort(tt.want)
+
+			n := min(len(got), len(tt.want))
+			if len(got) != len(tt.want) {
+				t.Errorf("Wanted %v got %v paths", len(tt.want), len(got))
+				t.Logf("Checking up to %d values", n)
+			}
+
+			for i := range n {
+				gotPath := got[i]
+				wantPath := idx.Root + "/" + tt.want[i]
+				if gotPath != wantPath {
+					t.Errorf("At %d wanted %v, got %v", i, wantPath, gotPath)
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/index/schema.sql b/pkg/index/schema.sql
new file mode 100644
index 0000000..fb06351
--- /dev/null
+++ b/pkg/index/schema.sql
@@ -0,0 +1,70 @@
+-- TABLE of config values
+CREATE TABLE Indexes(
+    root TEXT NOT NULL,
+    followSym DATE
+);
+
+-- Schema
+CREATE TABLE Documents(
+    id INTEGER PRIMARY KEY,
+    path TEXT UNIQUE NOT NULL,
+    title TEXT,
+    date INT,
+    fileTime INT,
+    meta BLOB
+);
+
+CREATE TABLE Authors(
+    id INTEGER PRIMARY KEY,
+    name TEXT UNIQUE NOT NULL
+);
+
+CREATE TABLE Aliases(
+    authorId INT NOT NULL,
+    alias TEXT UNIQUE NOT NULL,
+    FOREIGN KEY (authorId) REFERENCES Authors(id)
+);
+
+CREATE TABLE Tags(
+    id INTEGER PRIMARY KEY,
+    name TEXT UNIQUE NOT NULL
+);
+
+CREATE TABLE Links(
+    referencedId INT,
+    refererId INT,
+    FOREIGN KEY (referencedId) REFERENCES Documents(id),
+    FOREIGN KEY (refererId) REFERENCES Documents(id)
+);
+
+CREATE TABLE DocumentAuthors(
+    docId INT NOT NULL,
+    authorId INT NOT NULL,
+    FOREIGN KEY (docId) REFERENCES Documents(id),
+    FOREIGN KEY (authorId) REFERENCES Authors(id)
+);
+
+CREATE TABLE DocumentTags(
+    docId INT NOT NULL,
+    tagId INT NOT NULL,
+    FOREIGN KEY (docId) REFERENCES Documents(id),
+    FOREIGN KEY (tagId) REFERENCES Tags(id),
+    UNIQUE(docId, tagId)
+);
+
+-- Indexes
+CREATE INDEX idx_doc_dates
+ON Documents (date);
+CREATE INDEX idx_doc_titles
+ON Documents (title);
+
+CREATE INDEX idx_author_name
+ON Authors(name);
+
+CREATE INDEX idx_aliases_alias
+ON Aliases(alias);
+CREATE INDEX idx_aliases_authorId
+ON Aliases(authorId);
+
+CREATE INDEX idx_doctags_tagid
+ON DocumentTags (tagId);
diff --git a/pkg/query/data_structures.go b/pkg/query/data_structures.go
new file mode 100644
index 0000000..c48ecde
--- /dev/null
+++ b/pkg/query/data_structures.go
@@ -0,0 +1,81 @@
+package query
+
+import "errors"
+
+// not threadsafe implementation of stack
+type nodeStack struct {
+	buf []*Node
+}
+
+// Push and Pop use pointer receivers so their changes to buf persist
+func (s *nodeStack) Push(n *Node) {
+	s.buf = append(s.buf, n)
+}
+func (s *nodeStack) Pop() *Node {
+	last_index := len(s.buf) - 1
+	n := s.buf[last_index]
+	s.buf = s.buf[:last_index]
+	return n
+}
+func (s nodeStack) Peek() *Node {
+	return s.buf[len(s.buf)-1]
+}
+func (s nodeStack) IsEmpty() bool {
+	return len(s.buf) == 0
+}
+
+// not threadsafe ring-buffer queue; one slot is kept free to tell a full
+// queue from an empty one
+type nodeQueue struct {
+	buf  []*Node
+	head int
+	tail int
+}
+
+func makeNodeQueue(initial *Node, cap int) nodeQueue {
+	if cap < 1 {
+		panic("Invalid nodeQueue Capacity")
+	}
+
+	q := nodeQueue{
+		buf:  make([]*Node, cap+1), // full length plus the sentinel slot
+		head: 0,
+		tail: 1,
+	}
+	q.buf[0] = initial
+
+	return q
+}
+
+func (q *nodeQueue) Enqueue(n *Node) error {
+	new_tail := (q.tail + 1) % len(q.buf)
+	if new_tail == q.head {
+		return errors.New("Queue out of capacity")
+	}
+
+	q.buf[q.tail] = n
+	q.tail = new_tail
+	return nil
+}
+func (q *nodeQueue) Dequeue() (*Node, error) {
+	if q.head == q.tail {
+		return nil, errors.New("Empty Queue")
+	}
+
+	n := q.buf[q.head]
+	q.head = (q.head + 1) % len(q.buf)
+	return n, nil
+}
+func (q nodeQueue) PeekHead() (*Node, error) {
+	if q.head == q.tail {
+		return nil, errors.New("Empty queue")
+	}
+	return q.buf[q.head], nil
+}
+func (q nodeQueue) PeekTail() (*Node, error) {
+	if q.head == q.tail {
+		return nil, errors.New("Empty Queue")
+	}
+	return q.buf[(q.tail-1+len(q.buf))%len(q.buf)], nil
+}
+func (q nodeQueue) IsEmpty() bool {
+	return q.head == q.tail
+}
diff --git a/pkg/query/lang.md b/pkg/query/lang.md
new file mode 100644
index 0000000..a399cb8
--- /dev/null
+++ b/pkg/query/lang.md
@@ -0,0 +1,12 @@
+# Query Language Spec
+
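+Queries combine bare string statements with boolean operators, e.g.
+`"atlas" and not "draft"` (illustrative; statement matching semantics are
+not yet specified).
+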
+```
+<query> := <statement> | <query> <op> <query>
+
+<statement> := <string>
+<string> := {strings}
+<op> := <operator>
+
+<operator> := "and" | "or" | "not" | "similar"
+```
diff --git a/pkg/query/parser.go b/pkg/query/parser.go
new file mode 100644
index 0000000..355b18c
--- /dev/null
+++ b/pkg/query/parser.go
@@ -0,0 +1,19 @@
+package query
+
+type TokenType uint64
+
+const (
+	TOKEN_ERROR TokenType = iota
+	TOKEN_EOF
+	TOKEN_AND
+	TOKEN_OR
+	TOKEN_NOT
+	TOKEN_SIMILAR
+	TOKEN_STATEMENT
+	// TODO: consider adding regex token
+)
+
+type Token struct {
+	Type    TokenType
+	Content string
+}
diff --git a/pkg/query/query.go b/pkg/query/query.go
new file mode 100644
index 0000000..b712370
--- /dev/null
+++ b/pkg/query/query.go
@@ -0,0 +1,50 @@
+package query
+
+type Node struct {
+	Parent   *Node
+	Children []*Node
+	Token
+}
+
+type AST struct {
+	root Node
+	size uint64
+}
+
+// Walk an ast depth first
+func (T AST) dfWalk() func() (*Node, bool) {
+	stack := nodeStack{make([]*Node, 0, T.size)}
+	stack.Push(&T.root)
+
+	return func() (*Node, bool) {
+		n := stack.Pop()
+		for _, child := range n.Children {
+			stack.Push(child)
+		}
+
+		if stack.IsEmpty() {
+			return n, false
+		}
+		return n, true
+	}
+}
+
+// Walk an ast breadth first
+func (T AST) bfWalk() func() (*Node, bool) {
+	// size+1 leaves room for the root enqueued at construction
+	queue := makeNodeQueue(&T.root, int(T.size)+1)
+
+	return func() (*Node, bool) {
+		n, err := queue.Dequeue()
+		if err != nil {
+			return nil, false
+		}
+
+		for _, child := range n.Children {
+			// capacity covers the whole tree, so this cannot fail
+			queue.Enqueue(child)
+		}
+		return n, true
+	}
+}
-- 
cgit v1.2.3