author    JP Appel <jeanpierre.appel01@gmail.com>  2025-04-27 00:49:27 -0400
committer JP Appel <jeanpierre.appel01@gmail.com>  2025-04-27 00:49:27 -0400
commit    34b8d8ff1f9d65c08a9156d72f08cf548183c6f4 (patch)
tree      a00fa0410a7bcde125a37b50b3a4956c838fa569 /pkg/index
parent    42527fdb0aca0d30652bb3052b80ab75ab057572 (diff)
Large commit; many features
Diffstat (limited to 'pkg/index')
-rw-r--r--  pkg/index/filters.go        91
-rw-r--r--  pkg/index/filters_test.go  112
-rw-r--r--  pkg/index/index.go         271
-rw-r--r--  pkg/index/index_test.go    140
-rw-r--r--  pkg/index/schema.sql        70
5 files changed, 684 insertions, 0 deletions
diff --git a/pkg/index/filters.go b/pkg/index/filters.go
new file mode 100644
index 0000000..f59a5d6
--- /dev/null
+++ b/pkg/index/filters.go
@@ -0,0 +1,91 @@
+package index
+
+import (
+ "io"
+ "path/filepath"
+)
+
+// TODO: create excluded path filter factory
+
+type DocFilter func(infoPath, io.ReadSeeker) bool
+
+func NewExtensionFilter(ext string) DocFilter {
+ return func(ip infoPath, _ io.ReadSeeker) bool {
+ return filepath.Ext(ip.path) == ext
+ }
+}
+
+func NewMaxFilesizeFilter(size int64) DocFilter {
+ return func(ip infoPath, _ io.ReadSeeker) bool {
+ return ip.info.Size() <= size
+ }
+}
+
+// YamlHeaderFilter reports whether the stream begins with a yaml front
+// matter block: an opening "---\n" later closed by a "\n---\n" line.
+func YamlHeaderFilter(_ infoPath, r io.ReadSeeker) bool {
+ const bufSize = 4096
+ buf := make([]byte, bufSize)
+
+	// The opening marker must be exactly "---\n"; io.ReadFull avoids
+	// mistaking a short first Read for a missing header.
+	carry := make([]byte, 4)
+	cmp := make([]byte, 5)
+	if _, err := io.ReadFull(r, carry); err != nil || string(carry) != "---\n" {
+		return false
+	}
+
+ headerFound := false
+ readMore := true
+ for readMore {
+ buf = buf[:bufSize]
+ n, err := r.Read(buf)
+ if err == io.EOF {
+ readMore = false
+ } else if err != nil {
+ return false
+ }
+ buf = buf[:n]
+
+		// Check for a terminator spanning the boundary between the carry
+		// (the last 4 bytes of the previous read) and this buffer; the
+		// terminator "\n---\n" is 5 bytes long, so cmp must hold 5 bytes.
+		for i := range min(4, n) {
+			if carry[i] != '\n' {
+				continue
+			}
+			for j := range 5 {
+				if i+j < 4 {
+					cmp[j] = carry[i+j]
+				} else {
+					cmp[j] = buf[i+j-4]
+				}
+			}
+			if string(cmp) == "\n---\n" {
+				headerFound = true
+				readMore = false
+				break
+			}
+		}
+ for i := range n - 4 {
+ b := buf[i]
+ if b == '\n' && string(buf[i:i+5]) == "\n---\n" {
+ headerFound = true
+ readMore = false
+ break
+ }
+ }
+
+		// Keep the last 4 bytes for the next boundary check; require
+		// n >= 4 so the slice below cannot go out of range.
+		if readMore && n >= 4 {
+			copy(carry, buf[n-4:])
+		}
+ }
+
+ return headerFound
+}
+
+func DefaultFilters() []DocFilter {
+ return []DocFilter{NewExtensionFilter(".md"), NewMaxFilesizeFilter(200 * 1024)}
+}
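
The filters above compose as a predicate chain: a file is indexed only when every DocFilter accepts it, and YamlHeaderFilter consumes the reader, so callers must rewind between checks. A minimal in-package sketch of that contract (the helper passesAll is hypothetical and not part of this commit; FilterOne in index.go below is the real consumer):

    // passesAll reports whether every filter accepts the file at path.
    // Hypothetical helper, assumes it lives inside package index
    // (uses the unexported infoPath type) and imports io and os.
    func passesAll(path string, filters []DocFilter) (bool, error) {
    	info, err := os.Stat(path)
    	if err != nil {
    		return false, err
    	}
    	f, err := os.Open(path)
    	if err != nil {
    		return false, err
    	}
    	defer f.Close()

    	for _, filter := range filters {
    		if !filter(infoPath{path, info}, f) {
    			return false, nil
    		}
    		// Rewind so the next filter reads from the start of the file.
    		if _, err := f.Seek(0, io.SeekStart); err != nil {
    			return false, err
    		}
    	}
    	return true, nil
    }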
diff --git a/pkg/index/filters_test.go b/pkg/index/filters_test.go
new file mode 100644
index 0000000..897a82f
--- /dev/null
+++ b/pkg/index/filters_test.go
@@ -0,0 +1,112 @@
+package index
+
+import (
+ "bytes"
+ "io"
+ "os"
+ "testing"
+)
+
+func noYamlHeader() io.ReadSeeker {
+ buf := []byte("just some text")
+ return bytes.NewReader(buf)
+}
+
+func incompleteYamlHeader() io.ReadSeeker {
+ buf := []byte("---\nfoo:bar\ntitle:bizbaz\nauthor:\n-JP Appel\n---")
+ return bytes.NewReader(buf)
+}
+
+func completeYamlHeader() io.ReadSeeker {
+ buf := []byte("---\nfoo:bar\ntitle:bizbaz\nauthor:\n-JP Appel\n---\n")
+ return bytes.NewReader(buf)
+}
+
+func trailingYamlHeader() io.ReadSeeker {
+	buf := []byte("---\nfoo:bar\ntitle:bizbaz\nauthor:\n-JP Appel\n---\nhere is some content\nanother line of text")
+ return bytes.NewReader(buf)
+}
+
+func extensionless(t *testing.T) infoPath {
+ root := t.TempDir()
+ path := root + "/" + "afile"
+ f, err := os.Create(path)
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer f.Close()
+
+ if _, err := f.WriteString("this is a file"); err != nil {
+ t.Fatal(err)
+ }
+
+ info, err := f.Stat()
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ return infoPath{path, info}
+}
+
+func markdownExtension(t *testing.T) infoPath {
+ root := t.TempDir()
+ path := root + "/" + "a.md"
+ f, err := os.Create(path)
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer f.Close()
+
+ info, err := f.Stat()
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ return infoPath{path, info}
+}
+
+func TestYamlHeaderFilter(t *testing.T) {
+ tests := []struct {
+ name string
+ r io.ReadSeeker
+ want bool
+ }{
+ {"completeYamlHeader", completeYamlHeader(), true},
+ {"trailingYamlHeader", trailingYamlHeader(), true},
+ {"noYamlHeader", noYamlHeader(), false},
+ {"incompleteYamlHeader", incompleteYamlHeader(), false},
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ got := YamlHeaderFilter(infoPath{}, tt.r)
+ if got != tt.want {
+ t.Errorf("YamlHeaderFilter() = %v, want %v", got, tt.want)
+ }
+ })
+ }
+}
+
+func TestExtensionFilter(t *testing.T) {
+ tests := []struct {
+ name string
+ infoGen func(*testing.T) infoPath
+ ext string
+ want bool
+ }{
+ {"no extension, accept .md", extensionless, ".md", false},
+ {"no extension, accept all", extensionless, "", true},
+ {"markdown, accept .md", markdownExtension, ".md", true},
+		{"markdown, accept .png", markdownExtension, ".png", false},
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ filter := NewExtensionFilter(tt.ext)
+ ip := tt.infoGen(t)
+ got := filter(ip, nil)
+
+ if got != tt.want {
+ t.Errorf("ExtensionFilter() = %v, want %v", got, tt.want)
+ }
+ })
+ }
+}
diff --git a/pkg/index/index.go b/pkg/index/index.go
new file mode 100644
index 0000000..074b659
--- /dev/null
+++ b/pkg/index/index.go
@@ -0,0 +1,271 @@
+package index
+
+import (
+ "fmt"
+ "io"
+ "os"
+ "slices"
+ "sync"
+ "time"
+
+ "github.com/goccy/go-yaml"
+)
+
+type Document struct {
+ Path string
+ Title string `yaml:"title"`
+ Date time.Time `yaml:"date"`
+ FileTime time.Time
+ Authors []string `yaml:"authors"`
+ Tags []string `yaml:"tags"`
+ Links []string
+ OtherMeta string // unsure about how to handle this
+}
+
+type infoPath struct {
+ path string
+ info os.FileInfo
+}
+
+type Index struct {
+ Root string // root directory for searching
+ Documents map[string]*Document
+ Filters []DocFilter
+}
+
+func (idx Index) String() string {
+ // TODO: print info about active filters
+ return fmt.Sprintf("%s Documents[%d] Filters[%d]", idx.Root, len(idx.Documents), len(idx.Filters))
+}
+
+func (doc Document) Equal(other Document) bool {
+	if len(doc.Authors) != len(other.Authors) || len(doc.Tags) != len(other.Tags) || len(doc.Links) != len(other.Links) || doc.Path != other.Path || doc.Title != other.Title || doc.OtherMeta != other.OtherMeta || !doc.Date.Equal(other.Date) || !doc.FileTime.Equal(other.FileTime) {
+		return false
+	}
+
+	// Author order is significant; Tags and Links are compared as sets.
+	if !slices.Equal(doc.Authors, other.Authors) {
+		return false
+	}
+
+	// Sort clones so Equal does not reorder the callers' slices.
+	docTags, otherTags := slices.Clone(doc.Tags), slices.Clone(other.Tags)
+	slices.Sort(docTags)
+	slices.Sort(otherTags)
+	if !slices.Equal(docTags, otherTags) {
+		return false
+	}
+
+	docLinks, otherLinks := slices.Clone(doc.Links), slices.Clone(other.Links)
+	slices.Sort(docLinks)
+	slices.Sort(otherLinks)
+	return slices.Equal(docLinks, otherLinks)
+}
+
+func visit(file infoPath, visitQueue chan<- infoPath, filterQueue chan<- infoPath, wg *sync.WaitGroup) {
+ // TODO: check if symlink, and handle appropriately
+ // TODO: extract error out of function
+
+ if file.info.IsDir() {
+ entries, err := os.ReadDir(file.path)
+ if err != nil {
+ panic(err)
+ }
+
+ wg.Add(len(entries))
+ for _, entry := range entries {
+ entryInfo, err := entry.Info()
+ if err != nil {
+ panic(err)
+ }
+ // PERF: prevents deadlock but introduces an additional goroutine overhead per file
+ go func(path string) {
+ visitQueue <- infoPath{path: path, info: entryInfo}
+ }(file.path + "/" + entry.Name())
+ }
+ } else if file.info.Mode().IsRegular() {
+ filterQueue <- file
+ }
+
+ wg.Done()
+}
+
+func workerTraverse(wg *sync.WaitGroup, visitQueue chan infoPath, filterQueue chan<- infoPath) {
+ for work := range visitQueue {
+ visit(work, visitQueue, filterQueue, wg)
+ }
+}
+
+func (idx Index) Traverse(numWorkers uint) []string {
+ if numWorkers <= 1 {
+		panic(fmt.Sprint("Traverse requires at least 2 workers, got ", numWorkers))
+ }
+ docs := make([]string, 0)
+
+ rootInfo, err := os.Stat(idx.Root)
+ if err != nil {
+ panic(err)
+ }
+
+ jobs := make(chan infoPath, numWorkers)
+ filterQueue := make(chan infoPath, numWorkers)
+
+ activeJobs := &sync.WaitGroup{}
+
+ // start workers
+ for range numWorkers {
+ go workerTraverse(activeJobs, jobs, filterQueue)
+ }
+
+ // init send
+ activeJobs.Add(1)
+ jobs <- infoPath{path: idx.Root, info: rootInfo}
+
+	// close both queues once every traversal job has finished
+ go func() {
+ activeJobs.Wait()
+ close(jobs)
+ close(filterQueue)
+ }()
+
+ // gather
+ for doc := range filterQueue {
+ docs = append(docs, doc.path)
+ }
+
+ return docs
+}
+
+func (idx Index) FilterOne(path string) bool {
+	info, err := os.Stat(path)
+	if err != nil {
+		return false
+	}
+
+	f, err := os.Open(path)
+	if err != nil {
+		return false
+	}
+	defer f.Close()
+
+	for _, filter := range idx.Filters {
+		if !filter(infoPath{path, info}, f) {
+			return false
+		}
+		// Rewind so the next filter reads from the start of the file.
+		if _, err := f.Seek(0, io.SeekStart); err != nil {
+			return false
+		}
+ }
+ return true
+}
+
+func (idx Index) Filter(paths []string, numWorkers uint) []string {
+ fPaths := make([]string, 0, len(paths))
+ jobs := make(chan string, numWorkers)
+ accepted := make(chan string, numWorkers)
+ wg := &sync.WaitGroup{}
+
+ wg.Add(int(numWorkers))
+ for range numWorkers {
+ go func(jobs <-chan string, accepted chan<- string, wg *sync.WaitGroup) {
+ for path := range jobs {
+ if idx.FilterOne(path) {
+ accepted <- path
+ }
+ }
+ wg.Done()
+ }(jobs, accepted, wg)
+ }
+
+ go func(jobs chan<- string) {
+ for _, path := range paths {
+ jobs <- path
+ }
+ close(jobs)
+ }(jobs)
+
+ go func() {
+ wg.Wait()
+ close(accepted)
+ }()
+
+ for path := range accepted {
+ fPaths = append(fPaths, path)
+ }
+
+ return fPaths
+}
+
+func (idx Index) ParseOne(path string) (*Document, error) {
+ doc := &Document{}
+ doc.Path = path
+
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+ info, err := f.Stat()
+ if err != nil {
+ return nil, err
+ }
+ doc.FileTime = info.ModTime()
+
+	// Skip the opening "---\n" marker; the filters have already verified
+	// the header's presence.
+	marker := make([]byte, 4)
+	if _, err := io.ReadFull(f, marker); err != nil {
+		return nil, err
+	}
+
+ // TODO: implement custom unmarshaller, for singular `Author`
+ dec := yaml.NewDecoder(f)
+	// TODO: distinguish a missing yaml header from a malformed one
+	if err := dec.Decode(doc); err != nil {
+		return nil, err
+	}
+
+ // TODO: body parsing
+
+ return doc, nil
+}
+
+func (idx Index) Parse(paths []string, numWorkers uint) {
+ jobs := make(chan string, numWorkers)
+ results := make(chan Document, numWorkers)
+ wg := &sync.WaitGroup{}
+
+ wg.Add(int(numWorkers))
+ for range numWorkers {
+ go func(jobs <-chan string, results chan<- Document, wg *sync.WaitGroup) {
+ for path := range jobs {
+ doc, err := idx.ParseOne(path)
+ if err != nil {
+ // TODO: propagate error
+ panic(err)
+ }
+
+ results <- *doc
+ }
+ wg.Done()
+ }(jobs, results, wg)
+ }
+
+ go func(jobs chan<- string, paths []string) {
+ for _, path := range paths {
+ jobs <- path
+ }
+ close(jobs)
+ }(jobs, paths)
+
+ go func(results chan Document, wg *sync.WaitGroup) {
+ wg.Wait()
+ close(results)
+ }(results, wg)
+
+ for doc := range results {
+ idx.Documents[doc.Path] = &doc
+ }
+}
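
index.go thus defines a three-stage pipeline: Traverse walks Root and emits every regular file, Filter winnows those paths through the DocFilter chain, and Parse decodes the yaml headers of the survivors into Documents. A minimal sketch of the intended call sequence, assuming a hypothetical ./notes root (the worker counts are illustrative; Documents must be initialized, since Parse writes into it and assignment to a nil map panics):

    idx := Index{
    	Root:      "./notes", // hypothetical root directory
    	Documents: make(map[string]*Document),
    	Filters:   append(DefaultFilters(), YamlHeaderFilter),
    }

    paths := idx.Traverse(4)         // every regular file under Root
    accepted := idx.Filter(paths, 4) // only paths passing every filter
    idx.Parse(accepted, 4)           // decode headers into idx.Documents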
diff --git a/pkg/index/index_test.go b/pkg/index/index_test.go
new file mode 100644
index 0000000..0b5d2f2
--- /dev/null
+++ b/pkg/index/index_test.go
@@ -0,0 +1,140 @@
+package index
+
+import (
+ "fmt"
+ "os"
+ "slices"
+ "testing"
+)
+
+var indexCases map[string]func(t *testing.T) Index
+
+func init() {
+ indexCases = make(map[string]func(t *testing.T) Index)
+
+ indexCases["single file"] = func(t *testing.T) Index {
+ root := t.TempDir()
+ index := Index{Root: root, Filters: []DocFilter{NewExtensionFilter(".md")}}
+
+		f, err := os.Create(root + "/a_file.md")
+		if err != nil {
+			t.Fatal(err)
+		}
+		f.WriteString("some file contents\n")
+		f.Close()
+
+ return index
+ }
+
+ indexCases["large file"] = func(t *testing.T) Index {
+ root := t.TempDir()
+ index := Index{Root: root}
+
+ return index
+ }
+
+ indexCases["worker saturation"] = func(t *testing.T) Index {
+ root := t.TempDir()
+ index := Index{Root: root}
+
+ permission := os.FileMode(0o777)
+ for _, dirName := range []string{"a", "b", "c", "d", "e", "f"} {
+ dir := root + "/" + dirName
+ if err := os.Mkdir(dir, permission); err != nil {
+ t.Fatal(err)
+ }
+ for i := range 8 {
+ fName := fmt.Sprint(dirName, i)
+ f, err := os.Create(dir + "/" + fName)
+ if err != nil {
+ t.Fatal(err)
+ }
+				f.WriteString(fName)
+				f.Close()
+ }
+ }
+
+ return index
+ }
+}
+
+func TestIndex_Traverse(t *testing.T) {
+ tests := []struct {
+ name string
+ indexCase func(t *testing.T) Index
+ numWorkers uint
+ want []string
+ }{
+ {name: "single file", indexCase: indexCases["single file"], numWorkers: 2, want: []string{"a_file.md"}},
+ {name: "saturation test", indexCase: indexCases["worker saturation"], numWorkers: 2, want: []string{
+ "a/a0", "a/a1", "a/a2", "a/a3", "a/a4", "a/a5", "a/a6", "a/a7",
+ "b/b0", "b/b1", "b/b2", "b/b3", "b/b4", "b/b5", "b/b6", "b/b7",
+ "c/c0", "c/c1", "c/c2", "c/c3", "c/c4", "c/c5", "c/c6", "c/c7",
+ "d/d0", "d/d1", "d/d2", "d/d3", "d/d4", "d/d5", "d/d6", "d/d7",
+ "e/e0", "e/e1", "e/e2", "e/e3", "e/e4", "e/e5", "e/e6", "e/e7",
+ "f/f0", "f/f1", "f/f2", "f/f3", "f/f4", "f/f5", "f/f6", "f/f7",
+ }},
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ idx := tt.indexCase(t)
+ got := idx.Traverse(tt.numWorkers)
+
+ slices.Sort(got)
+ slices.Sort(tt.want)
+
+ n := min(len(got), len(tt.want))
+ if len(got) != len(tt.want) {
+ t.Errorf("Wanted %v got %v paths", len(tt.want), len(got))
+ t.Logf("Checking up to %d values", n)
+ }
+
+ for i := range n {
+ gotPath := got[i]
+ wantPath := idx.Root + "/" + tt.want[i]
+ if gotPath != wantPath {
+ t.Errorf("At %d wanted %v, got %v", i, wantPath, gotPath)
+ }
+ }
+ })
+ }
+}
+
+func TestIndex_Filter(t *testing.T) {
+ tests := []struct {
+ name string
+ paths []string
+ indexCase func(t *testing.T) Index
+ numWorkers uint
+ want []string
+ }{
+ {"single file", []string{"a_file.md"}, indexCases["single file"], 2, []string{"a_file.md"}},
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ idx := tt.indexCase(t)
+ for i, path := range tt.paths {
+ tt.paths[i] = idx.Root + "/" + path
+ }
+
+ got := idx.Filter(tt.paths, tt.numWorkers)
+
+ slices.Sort(got)
+ slices.Sort(tt.want)
+
+ n := min(len(got), len(tt.want))
+ if len(got) != len(tt.want) {
+ t.Errorf("Wanted %v got %v paths", len(tt.want), len(got))
+ t.Logf("Checking up to %d values", n)
+ }
+
+ for i := range n {
+ gotPath := got[i]
+ wantPath := idx.Root + "/" + tt.want[i]
+ if gotPath != wantPath {
+ t.Errorf("At %d wanted %v, got %v", i, wantPath, gotPath)
+ }
+ }
+ })
+ }
+}
diff --git a/pkg/index/schema.sql b/pkg/index/schema.sql
new file mode 100644
index 0000000..fb06351
--- /dev/null
+++ b/pkg/index/schema.sql
@@ -0,0 +1,70 @@
+-- Per-index configuration values
+CREATE TABLE Indexes(
+    root TEXT NOT NULL,
+    followSym INTEGER -- boolean: whether to follow symlinks when indexing
+);
+
+-- Schema
+CREATE TABLE Documents(
+ id INTEGER PRIMARY KEY,
+ path TEXT UNIQUE NOT NULL,
+ title TEXT,
+ date INT,
+ fileTime INT,
+ meta BLOB
+);
+
+CREATE TABLE Authors(
+ id INTEGER PRIMARY KEY,
+ name TEXT UNIQUE NOT NULL
+);
+
+CREATE TABLE Aliases(
+ authorId INT NOT NULL,
+ alias TEXT UNIQUE NOT NULL,
+ FOREIGN KEY (authorId) REFERENCES Authors(id)
+);
+
+CREATE TABLE Tags(
+ id INTEGER PRIMARY KEY,
+    name TEXT UNIQUE NOT NULL
+);
+
+CREATE TABLE Links(
+ referencedId INT,
+ refererId INT,
+ FOREIGN KEY (referencedId) REFERENCES Documents(id),
+ FOREIGN KEY (refererId) REFERENCES Documents(id)
+);
+
+CREATE TABLE DocumentAuthors(
+ docId INT NOT NULL,
+ authorId INT NOT NULL,
+ FOREIGN KEY (docId) REFERENCES Documents(id),
+ FOREIGN KEY (authorId) REFERENCES Authors(id)
+);
+
+CREATE TABLE DocumentTags(
+ docId INT NOT NULL,
+ tagId INT NOT NULL,
+ FOREIGN KEY (docId) REFERENCES Documents(id),
+ FOREIGN KEY (tagId) REFERENCES Tags(id),
+ UNIQUE(docId, tagId)
+);
+
+-- Indexes
+CREATE INDEX idx_doc_dates
+ON Documents (date);
+CREATE INDEX idx_doc_titles
+ON Documents (title);
+
+CREATE INDEX idx_author_name
+ON Authors(name);
+
+CREATE INDEX idx_aliases_alias
+ON Aliases(alias);
+CREATE INDEX idx_aliases_authorId
+ON Aliases(authorId);
+
+CREATE INDEX idx_doctags_tagid
+ON DocumentTags (tagId);
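
Nothing in this commit wires the schema to any database code. One plausible way to apply it from Go, assuming the schema is embedded from schema.sql and the modernc.org/sqlite driver is used (both are assumptions, not part of this commit):

    package index

    import (
    	"database/sql"
    	_ "embed"

    	_ "modernc.org/sqlite" // assumed driver; registers as "sqlite"
    )

    //go:embed schema.sql
    var schema string

    // openIndexDB is a hypothetical helper: it opens (or creates) the
    // SQLite database at path and applies the schema. Rerunning it on an
    // already-initialized database fails, since the CREATE TABLE
    // statements lack IF NOT EXISTS.
    func openIndexDB(path string) (*sql.DB, error) {
    	db, err := sql.Open("sqlite", path)
    	if err != nil {
    		return nil, err
    	}
    	if _, err := db.Exec(schema); err != nil {
    		db.Close()
    		return nil, err
    	}
    	return db, nil
    }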