| field | value | date |
|---|---|---|
| author | JP Appel <jeanpierre.appel01@gmail.com> | 2025-04-27 00:49:27 -0400 |
| committer | JP Appel <jeanpierre.appel01@gmail.com> | 2025-04-27 00:49:27 -0400 |
| commit | 34b8d8ff1f9d65c08a9156d72f08cf548183c6f4 (patch) | |
| tree | a00fa0410a7bcde125a37b50b3a4956c838fa569 /pkg/index | |
| parent | 42527fdb0aca0d30652bb3052b80ab75ab057572 (diff) | |
Large commit; many features
Diffstat (limited to 'pkg/index')
| mode | file | insertions |
|---|---|---|
| -rw-r--r-- | pkg/index/filters.go | 84 |
| -rw-r--r-- | pkg/index/filters_test.go | 112 |
| -rw-r--r-- | pkg/index/index.go | 275 |
| -rw-r--r-- | pkg/index/index_test.go | 138 |
| -rw-r--r-- | pkg/index/schema.sql | 70 |
5 files changed, 679 insertions, 0 deletions
diff --git a/pkg/index/filters.go b/pkg/index/filters.go
new file mode 100644
index 0000000..f59a5d6
--- /dev/null
+++ b/pkg/index/filters.go
@@ -0,0 +1,84 @@
+package index
+
+import (
+	"io"
+	"path/filepath"
+)
+
+// TODO: create excluded path filter factory
+
+type DocFilter func(infoPath, io.ReadSeeker) bool
+
+func NewExtensionFilter(ext string) DocFilter {
+	return func(ip infoPath, _ io.ReadSeeker) bool {
+		return filepath.Ext(ip.path) == ext
+	}
+}
+
+func NewMaxFilesizeFilter(size int64) DocFilter {
+	return func(ip infoPath, _ io.ReadSeeker) bool {
+		return ip.info.Size() <= size
+	}
+}
+
+func YamlHeaderFilter(_ infoPath, r io.ReadSeeker) bool {
+	const bufSize = 4096
+	buf := make([]byte, bufSize)
+
+	carry := make([]byte, 4)
+	cmp := make([]byte, 5) // must hold the full 5-byte terminator "\n---\n"
+	n, err := io.ReadFull(r, carry)
+	if err != nil || n < 4 || string(carry) != "---\n" {
+		return false
+	}
+
+	headerFound := false
+	readMore := true
+	for readMore {
+		buf = buf[:bufSize]
+		n, err := r.Read(buf)
+		if err == io.EOF {
+			readMore = false
+		} else if err != nil {
+			return false
+		}
+		buf = buf[:n]
+
+		// check for a terminator "\n---\n" straddling the carry/buf boundary
+		for i := range min(4, n) {
+			b := carry[i]
+			for j := range 5 {
+				if i+j < 4 {
+					cmp[j] = carry[i+j]
+				} else {
+					cmp[j] = buf[i+j-4]
+				}
+			}
+			if b == '\n' && string(cmp) == "\n---\n" {
+				headerFound = true
+				readMore = false
+				break
+			}
+		}
+		for i := range n - 4 {
+			b := buf[i]
+			if b == '\n' && string(buf[i:i+5]) == "\n---\n" {
+				headerFound = true
+				readMore = false
+				break
+			}
+		}
+
+		if readMore && n >= 4 { // Read may legally return fewer than 4 bytes
+			for i := range 4 {
+				carry[i] = buf[n-4+i]
+			}
+		}
+	}
+
+	return headerFound
+}
+
+func DefaultFilters() []DocFilter {
+	return []DocFilter{NewExtensionFilter(".md"), NewMaxFilesizeFilter(200 * 1024)}
+}
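One case worth pinning down here: the carry buffer in `YamlHeaderFilter` exists so a closing `\n---\n` that straddles two 4096-byte reads is still found. A minimal in-package test sketch of that boundary (hypothetical, not part of this commit; the function name and padding arithmetic are mine):

```go
package index

import (
	"bytes"
	"testing"
)

// Hypothetical test, not part of this commit: the closing "\n---\n" starts at
// offset 4098, so it straddles the first 4096-byte read, which covers offsets
// 4..4099 once the opening "---\n" has been consumed into the carry.
func TestYamlHeaderFilterSpansReadBoundary(t *testing.T) {
	doc := append([]byte("---\n"), bytes.Repeat([]byte("a"), 4094)...)
	doc = append(doc, "\n---\nbody\n"...)
	if !YamlHeaderFilter(infoPath{}, bytes.NewReader(doc)) {
		t.Error("terminator spanning the read boundary should be accepted")
	}
}
```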
{"incompleteYamlHeader", incompleteYamlHeader(), false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := YamlHeaderFilter(infoPath{}, tt.r) + if got != tt.want { + t.Errorf("YamlHeaderFilter() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestExtensionFilter(t *testing.T) { + tests := []struct { + name string + infoGen func(*testing.T) infoPath + ext string + want bool + }{ + {"no extension, accept .md", extensionless, ".md", false}, + {"no extension, accept all", extensionless, "", true}, + {"markdown, accept .md", markdownExtension, ".md", true}, + {"makdown, accept .png", markdownExtension, ".png", false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + filter := NewExtensionFilter(tt.ext) + ip := tt.infoGen(t) + got := filter(ip, nil) + + if got != tt.want { + t.Errorf("ExtensionFilter() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/index/index.go b/pkg/index/index.go new file mode 100644 index 0000000..074b659 --- /dev/null +++ b/pkg/index/index.go @@ -0,0 +1,275 @@ +package index + +import ( + "errors" + "fmt" + "io" + "os" + "slices" + "sync" + "time" + + "github.com/goccy/go-yaml" +) + +type Document struct { + Path string + Title string `yaml:"title"` + Date time.Time `yaml:"date"` + FileTime time.Time + Authors []string `yaml:"authors"` + Tags []string `yaml:"tags"` + Links []string + OtherMeta string // unsure about how to handle this +} + +type infoPath struct { + path string + info os.FileInfo +} + +type Index struct { + Root string // root directory for searching + Documents map[string]*Document + Filters []DocFilter +} + +func (idx Index) String() string { + // TODO: print info about active filters + return fmt.Sprintf("%s Documents[%d] Filters[%d]", idx.Root, len(idx.Documents), len(idx.Filters)) +} + +func (doc Document) Equal(other Document) bool { + if len(doc.Authors) != len(other.Authors) || len(doc.Tags) != len(other.Tags) || len(doc.Links) != len(other.Links) || doc.Path != other.Path || doc.Title != other.Title || doc.OtherMeta != other.OtherMeta || !doc.Date.Equal(other.Date) || !doc.FileTime.Equal(other.FileTime) { + return false + } + + if !slices.Equal(doc.Authors, other.Authors) { + return false + } + + slices.Sort(doc.Tags) + slices.Sort(other.Tags) + for i := range doc.Tags { + if doc.Tags[i] != other.Tags[i] { + return false + } + } + + slices.Sort(doc.Links) + slices.Sort(other.Links) + for i := range doc.Links { + if doc.Links[i] != other.Links[i] { + return false + } + } + + return true +} + +func visit(file infoPath, visitQueue chan<- infoPath, filterQueue chan<- infoPath, wg *sync.WaitGroup) { + // TODO: check if symlink, and handle appropriately + // TODO: extract error out of function + + if file.info.IsDir() { + entries, err := os.ReadDir(file.path) + if err != nil { + panic(err) + } + + wg.Add(len(entries)) + for _, entry := range entries { + entryInfo, err := entry.Info() + if err != nil { + panic(err) + } + // PERF: prevents deadlock but introduces an additional goroutine overhead per file + go func(path string) { + visitQueue <- infoPath{path: path, info: entryInfo} + }(file.path + "/" + entry.Name()) + } + } else if file.info.Mode().IsRegular() { + filterQueue <- file + } + + wg.Done() +} + +func workerTraverse(wg *sync.WaitGroup, visitQueue chan infoPath, filterQueue chan<- infoPath) { + for work := range visitQueue { + visit(work, visitQueue, filterQueue, wg) + } +} + +func (idx Index) Traverse(numWorkers uint) []string { + if numWorkers <= 1 { + 
diff --git a/pkg/index/index.go b/pkg/index/index.go
new file mode 100644
index 0000000..074b659
--- /dev/null
+++ b/pkg/index/index.go
@@ -0,0 +1,275 @@
+package index
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"slices"
+	"sync"
+	"time"
+
+	"github.com/goccy/go-yaml"
+)
+
+type Document struct {
+	Path      string
+	Title     string    `yaml:"title"`
+	Date      time.Time `yaml:"date"`
+	FileTime  time.Time
+	Authors   []string `yaml:"authors"`
+	Tags      []string `yaml:"tags"`
+	Links     []string
+	OtherMeta string // unsure about how to handle this
+}
+
+type infoPath struct {
+	path string
+	info os.FileInfo
+}
+
+type Index struct {
+	Root      string // root directory for searching
+	Documents map[string]*Document
+	Filters   []DocFilter
+}
+
+func (idx Index) String() string {
+	// TODO: print info about active filters
+	return fmt.Sprintf("%s Documents[%d] Filters[%d]", idx.Root, len(idx.Documents), len(idx.Filters))
+}
+
+func (doc Document) Equal(other Document) bool {
+	if len(doc.Authors) != len(other.Authors) || len(doc.Tags) != len(other.Tags) || len(doc.Links) != len(other.Links) || doc.Path != other.Path || doc.Title != other.Title || doc.OtherMeta != other.OtherMeta || !doc.Date.Equal(other.Date) || !doc.FileTime.Equal(other.FileTime) {
+		return false
+	}
+
+	if !slices.Equal(doc.Authors, other.Authors) {
+		return false
+	}
+
+	slices.Sort(doc.Tags)
+	slices.Sort(other.Tags)
+	for i := range doc.Tags {
+		if doc.Tags[i] != other.Tags[i] {
+			return false
+		}
+	}
+
+	slices.Sort(doc.Links)
+	slices.Sort(other.Links)
+	for i := range doc.Links {
+		if doc.Links[i] != other.Links[i] {
+			return false
+		}
+	}
+
+	return true
+}
+
+func visit(file infoPath, visitQueue chan<- infoPath, filterQueue chan<- infoPath, wg *sync.WaitGroup) {
+	// TODO: check if symlink, and handle appropriately
+	// TODO: extract error out of function
+
+	if file.info.IsDir() {
+		entries, err := os.ReadDir(file.path)
+		if err != nil {
+			panic(err)
+		}
+
+		wg.Add(len(entries))
+		for _, entry := range entries {
+			entryInfo, err := entry.Info()
+			if err != nil {
+				panic(err)
+			}
+			// PERF: prevents deadlock but introduces an additional goroutine overhead per file
+			go func(path string) {
+				visitQueue <- infoPath{path: path, info: entryInfo}
+			}(file.path + "/" + entry.Name())
+		}
+	} else if file.info.Mode().IsRegular() {
+		filterQueue <- file
+	}
+
+	wg.Done()
+}
+
+func workerTraverse(wg *sync.WaitGroup, visitQueue chan infoPath, filterQueue chan<- infoPath) {
+	for work := range visitQueue {
+		visit(work, visitQueue, filterQueue, wg)
+	}
+}
+
+func (idx Index) Traverse(numWorkers uint) []string {
+	if numWorkers <= 1 {
+		panic(fmt.Sprint("Invalid number of workers: ", numWorkers))
+	}
+	docs := make([]string, 0)
+
+	rootInfo, err := os.Stat(idx.Root)
+	if err != nil {
+		panic(err)
+	}
+
+	jobs := make(chan infoPath, numWorkers)
+	filterQueue := make(chan infoPath, numWorkers)
+
+	activeJobs := &sync.WaitGroup{}
+
+	// start workers
+	for range numWorkers {
+		go workerTraverse(activeJobs, jobs, filterQueue)
+	}
+
+	// init send
+	activeJobs.Add(1)
+	jobs <- infoPath{path: idx.Root, info: rootInfo}
+
+	// TODO: close jobs queue
+	go func() {
+		activeJobs.Wait()
+		close(jobs)
+		close(filterQueue)
+	}()
+
+	// gather
+	for doc := range filterQueue {
+		docs = append(docs, doc.path)
+	}
+
+	return docs
+}
+
+func (idx Index) FilterOne(path string) bool {
+	info, err := os.Stat(path)
+	if err != nil {
+		return false
+	}
+
+	f, err := os.Open(path)
+	if err != nil {
+		return false
+	}
+	defer f.Close()
+
+	for _, filter := range idx.Filters {
+		if !filter(infoPath{path, info}, f) {
+			return false
+		}
+		if _, err := f.Seek(0, io.SeekStart); err != nil {
+			return false
+		}
+	}
+	return true
+}
+
+func (idx Index) Filter(paths []string, numWorkers uint) []string {
+	fPaths := make([]string, 0, len(paths))
+	jobs := make(chan string, numWorkers)
+	accepted := make(chan string, numWorkers)
+	wg := &sync.WaitGroup{}
+
+	wg.Add(int(numWorkers))
+	for range numWorkers {
+		go func(jobs <-chan string, accepted chan<- string, wg *sync.WaitGroup) {
+			for path := range jobs {
+				if idx.FilterOne(path) {
+					accepted <- path
+				}
+			}
+			wg.Done()
+		}(jobs, accepted, wg)
+	}
+
+	go func(jobs chan<- string) {
+		for _, path := range paths {
+			jobs <- path
+		}
+		close(jobs)
+	}(jobs)
+
+	go func() {
+		wg.Wait()
+		close(accepted)
+	}()
+
+	for path := range accepted {
+		fPaths = append(fPaths, path)
+	}
+
+	return fPaths
+}
+
+func (idx Index) ParseOne(path string) (*Document, error) {
+	doc := &Document{Path: path}
+
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	info, err := f.Stat()
+	if err != nil {
+		return nil, err
+	}
+	doc.FileTime = info.ModTime()
+
+	buf := make([]byte, 4, 1024)
+	n, err := f.Read(buf)
+	if err != nil {
+		return nil, err
+	} else if n != 4 {
+		return nil, errors.New("short read")
+	}
+
+	// TODO: implement custom unmarshaller, for singular `Author`
+	dec := yaml.NewDecoder(f)
+	// TODO: handle no yaml header error
+	if err := dec.Decode(doc); err != nil {
+		panic(err)
+	}
+
+	// TODO: body parsing
+
+	return doc, nil
+}
+
+func (idx Index) Parse(paths []string, numWorkers uint) {
+	jobs := make(chan string, numWorkers)
+	results := make(chan Document, numWorkers)
+	wg := &sync.WaitGroup{}
+
+	wg.Add(int(numWorkers))
+	for range numWorkers {
+		go func(jobs <-chan string, results chan<- Document, wg *sync.WaitGroup) {
+			for path := range jobs {
+				doc, err := idx.ParseOne(path)
+				if err != nil {
+					// TODO: propagate error
+					panic(err)
+				}
+
+				results <- *doc
+			}
+			wg.Done()
+		}(jobs, results, wg)
+	}
+
+	go func(jobs chan<- string, paths []string) {
+		for _, path := range paths {
+			jobs <- path
+		}
+		close(jobs)
+	}(jobs, paths)
+
+	go func(results chan Document, wg *sync.WaitGroup) {
+		wg.Wait()
+		close(results)
+	}(results, wg)
+
+	for doc := range results {
+		idx.Documents[doc.Path] = &doc
+	}
+}
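Taken together, the three exported stages form a pipeline: `Traverse` walks `Root` concurrently and returns candidate paths, `Filter` keeps the paths whose files pass every `DocFilter`, and `Parse` decodes the yaml headers into `Documents`. A usage sketch (the module path is hypothetical; note that `Documents` must be non-nil before `Parse` and that `Traverse` panics when given fewer than two workers):

```go
package main

import (
	"fmt"

	"example.com/you/yourmod/pkg/index" // hypothetical module path
)

func main() {
	idx := index.Index{
		Root:      "./notes",
		Documents: map[string]*index.Document{}, // Parse writes into this map
		Filters:   index.DefaultFilters(),
	}

	paths := idx.Traverse(4)         // concurrent directory walk
	accepted := idx.Filter(paths, 4) // every filter must accept each file
	idx.Parse(accepted, 4)           // yaml headers -> idx.Documents

	fmt.Println(idx) // e.g. "./notes Documents[12] Filters[2]"
}
```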
diff --git a/pkg/index/index_test.go b/pkg/index/index_test.go
new file mode 100644
index 0000000..0b5d2f2
--- /dev/null
+++ b/pkg/index/index_test.go
@@ -0,0 +1,138 @@
+package index
+
+import (
+	"fmt"
+	"os"
+	"slices"
+	"testing"
+)
+
+var indexCases map[string]func(t *testing.T) Index
+
+func init() {
+	indexCases = make(map[string]func(t *testing.T) Index)
+
+	indexCases["single file"] = func(t *testing.T) Index {
+		root := t.TempDir()
+		index := Index{Root: root, Filters: []DocFilter{NewExtensionFilter(".md")}}
+
+		f, err := os.Create(root + "/a_file.md")
+		if err != nil {
+			t.Fatal(err)
+		}
+		f.WriteString("some file contents\n")
+
+		return index
+	}
+
+	indexCases["large file"] = func(t *testing.T) Index {
+		root := t.TempDir()
+		index := Index{Root: root}
+
+		return index
+	}
+
+	indexCases["worker saturation"] = func(t *testing.T) Index {
+		root := t.TempDir()
+		index := Index{Root: root}
+
+		permission := os.FileMode(0o777)
+		for _, dirName := range []string{"a", "b", "c", "d", "e", "f"} {
+			dir := root + "/" + dirName
+			if err := os.Mkdir(dir, permission); err != nil {
+				t.Fatal(err)
+			}
+			for i := range 8 {
+				fName := fmt.Sprint(dirName, i)
+				f, err := os.Create(dir + "/" + fName)
+				if err != nil {
+					t.Fatal(err)
+				}
+				f.WriteString(fName)
+			}
+		}
+
+		return index
+	}
+}
+
+func TestIndex_Traverse(t *testing.T) {
+	tests := []struct {
+		name       string
+		indexCase  func(t *testing.T) Index
+		numWorkers uint
+		want       []string
+	}{
+		{name: "single file", indexCase: indexCases["single file"], numWorkers: 2, want: []string{"a_file.md"}},
+		{name: "saturation test", indexCase: indexCases["worker saturation"], numWorkers: 2, want: []string{
+			"a/a0", "a/a1", "a/a2", "a/a3", "a/a4", "a/a5", "a/a6", "a/a7",
+			"b/b0", "b/b1", "b/b2", "b/b3", "b/b4", "b/b5", "b/b6", "b/b7",
+			"c/c0", "c/c1", "c/c2", "c/c3", "c/c4", "c/c5", "c/c6", "c/c7",
+			"d/d0", "d/d1", "d/d2", "d/d3", "d/d4", "d/d5", "d/d6", "d/d7",
+			"e/e0", "e/e1", "e/e2", "e/e3", "e/e4", "e/e5", "e/e6", "e/e7",
+			"f/f0", "f/f1", "f/f2", "f/f3", "f/f4", "f/f5", "f/f6", "f/f7",
+		}},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			idx := tt.indexCase(t)
+			got := idx.Traverse(tt.numWorkers)
+
+			slices.Sort(got)
+			slices.Sort(tt.want)
+
+			n := min(len(got), len(tt.want))
+			if len(got) != len(tt.want) {
+				t.Errorf("Wanted %v got %v paths", len(tt.want), len(got))
+				t.Logf("Checking up to %d values", n)
+			}
+
+			for i := range n {
+				gotPath := got[i]
+				wantPath := idx.Root + "/" + tt.want[i]
+				if gotPath != wantPath {
+					t.Errorf("At %d wanted %v, got %v", i, wantPath, gotPath)
+				}
+			}
+		})
+	}
+}
+
+func TestIndex_Filter(t *testing.T) {
+	tests := []struct {
+		name       string
+		paths      []string
+		indexCase  func(t *testing.T) Index
+		numWorkers uint
+		want       []string
+	}{
+		{"single file", []string{"a_file.md"}, indexCases["single file"], 2, []string{"a_file.md"}},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			idx := tt.indexCase(t)
+			for i, path := range tt.paths {
+				tt.paths[i] = idx.Root + "/" + path
+			}
+
+			got := idx.Filter(tt.paths, tt.numWorkers)
+
+			slices.Sort(got)
+			slices.Sort(tt.want)
+
+			n := min(len(got), len(tt.want))
+			if len(got) != len(tt.want) {
+				t.Errorf("Wanted %v got %v paths", len(tt.want), len(got))
+				t.Logf("Checking up to %d values", n)
+			}
+
+			for i := range n {
+				gotPath := got[i]
+				wantPath := idx.Root + "/" + tt.want[i]
+				if gotPath != wantPath {
+					t.Errorf("At %d wanted %v, got %v", i, wantPath, gotPath)
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/index/schema.sql b/pkg/index/schema.sql
new file mode 100644
index 0000000..fb06351
--- /dev/null
+++ b/pkg/index/schema.sql
@@ -0,0 +1,70 @@
+-- TABLE of config values
+CREATE TABLE Indexes(
+    root TEXT NOT NULL,
+    followSym BOOLEAN -- whether to follow symlinks during traversal
+);
+
+-- Schema
+CREATE TABLE Documents(
+    id INTEGER PRIMARY KEY,
+    path TEXT UNIQUE NOT NULL,
+    title TEXT,
+    date INT,
+    fileTime INT,
+    meta BLOB
+);
+
+CREATE TABLE Authors(
+    id INTEGER PRIMARY KEY,
+    name TEXT UNIQUE NOT NULL
+);
+
+CREATE TABLE Aliases(
+    authorId INT NOT NULL,
+    alias TEXT UNIQUE NOT NULL,
+    FOREIGN KEY (authorId) REFERENCES Authors(id)
+);
+
+CREATE TABLE Tags(
+    id INTEGER PRIMARY KEY,
+    name TEXT UNIQUE NOT NULL
+);
+
+CREATE TABLE Links(
+    referencedId INT,
+    refererId INT,
+    FOREIGN KEY (referencedId) REFERENCES Documents(id),
+    FOREIGN KEY (refererId) REFERENCES Documents(id)
+);
+
+CREATE TABLE DocumentAuthors(
+    docId INT NOT NULL,
+    authorId INT NOT NULL,
+    FOREIGN KEY (docId) REFERENCES Documents(id),
+    FOREIGN KEY (authorId) REFERENCES Authors(id)
+);
+
+CREATE TABLE DocumentTags(
+    docId INT NOT NULL,
+    tagId INT NOT NULL,
+    FOREIGN KEY (docId) REFERENCES Documents(id),
+    FOREIGN KEY (tagId) REFERENCES Tags(id),
+    UNIQUE(docId, tagId)
+);
+
+-- Indexes
+CREATE INDEX idx_doc_dates
+ON Documents (date);
+CREATE INDEX idx_doc_titles
+ON Documents (title);
+
+CREATE INDEX idx_author_name
+ON Authors(name);
+
+CREATE INDEX idx_aliases_alias
+ON Aliases(alias);
+CREATE INDEX idx_aliases_authorId
+ON Aliases(authorId);
+
+CREATE INDEX idx_doctags_tagid
+ON DocumentTags (tagId);
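For context, a query sketch against this schema (hypothetical, not part of the commit): resolving the documents carrying a given tag through the `DocumentTags` join table, which is the access pattern `idx_doctags_tagid` is there to serve.

```sql
-- Hypothetical query, not part of the commit: documents tagged 'golang',
-- newest first; idx_doctags_tagid covers the join on tagId.
SELECT d.path, d.title, d.date
FROM Documents d
JOIN DocumentTags dt ON dt.docId = d.id
JOIN Tags t ON t.id = dt.tagId
WHERE t.name = 'golang'
ORDER BY d.date DESC;
```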
