| field | value | date |
|---|---|---|
| author | JP Appel <jeanpierre.appel01@gmail.com> | 2025-04-27 00:49:27 -0400 |
| committer | JP Appel <jeanpierre.appel01@gmail.com> | 2025-04-27 00:49:27 -0400 |
| commit | 34b8d8ff1f9d65c08a9156d72f08cf548183c6f4 (patch) | |
| tree | a00fa0410a7bcde125a37b50b3a4956c838fa569 /pkg/index | |
| parent | 42527fdb0aca0d30652bb3052b80ab75ab057572 (diff) | |
Large commit; many features
Diffstat (limited to 'pkg/index')
| mode | file | insertions |
|---|---|---|
| -rw-r--r-- | pkg/index/filters.go | 84 |
| -rw-r--r-- | pkg/index/filters_test.go | 112 |
| -rw-r--r-- | pkg/index/index.go | 275 |
| -rw-r--r-- | pkg/index/index_test.go | 138 |
| -rw-r--r-- | pkg/index/schema.sql | 70 |
5 files changed, 679 insertions, 0 deletions
diff --git a/pkg/index/filters.go b/pkg/index/filters.go
new file mode 100644
index 0000000..f59a5d6
--- /dev/null
+++ b/pkg/index/filters.go
@@ -0,0 +1,84 @@
+package index
+
+import (
+	"io"
+	"path/filepath"
+)
+
+// TODO: create excluded path filter factory
+
+type DocFilter func(infoPath, io.ReadSeeker) bool
+
+func NewExtensionFilter(ext string) DocFilter {
+	return func(ip infoPath, _ io.ReadSeeker) bool {
+		return filepath.Ext(ip.path) == ext
+	}
+}
+
+func NewMaxFilesizeFilter(size int64) DocFilter {
+	return func(ip infoPath, _ io.ReadSeeker) bool {
+		return ip.info.Size() <= size
+	}
+}
+
+func YamlHeaderFilter(_ infoPath, r io.ReadSeeker) bool {
+	const bufSize = 4096
+	buf := make([]byte, bufSize)
+
+	carry := make([]byte, 4)
+	cmp := make([]byte, 5) // must hold the full 5-byte terminator "\n---\n"
+	n, err := io.ReadFull(r, carry)
+	if err != nil || n < 4 || string(carry) != "---\n" {
+		return false
+	}
+
+	headerFound := false
+	readMore := true
+	for readMore {
+		buf = buf[:bufSize]
+		n, err := r.Read(buf)
+		if err == io.EOF {
+			readMore = false
+		} else if err != nil {
+			return false
+		}
+		buf = buf[:n]
+
+		// check for a terminator "\n---\n" straddling the carry/buf boundary
+		for i := range min(4, n) {
+			b := carry[i]
+			for j := range 5 {
+				if i+j < 4 {
+					cmp[j] = carry[i+j]
+				} else {
+					cmp[j] = buf[i+j-4]
+				}
+			}
+			if b == '\n' && string(cmp) == "\n---\n" {
+				headerFound = true
+				readMore = false
+				break
+			}
+		}
+		for i := range n - 4 {
+			b := buf[i]
+			if b == '\n' && string(buf[i:i+5]) == "\n---\n" {
+				headerFound = true
+				readMore = false
+				break
+			}
+		}
+
+		if readMore && n >= 4 { // Read may legally return fewer than 4 bytes
+			for i := range 4 {
+				carry[i] = buf[n-4+i]
+			}
+		}
+	}
+
+	return headerFound
+}
+
+func DefaultFilters() []DocFilter {
+	return []DocFilter{NewExtensionFilter(".md"), NewMaxFilesizeFilter(200 * 1024)}
+}
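One case worth pinning down here: the carry buffer in `YamlHeaderFilter` exists so a closing `\n---\n` that straddles two 4096-byte reads is still found. A minimal in-package test sketch of that boundary (hypothetical, not part of this commit; the function name and padding arithmetic are mine):

```go
package index

import (
	"bytes"
	"testing"
)

// Hypothetical test, not part of this commit: the closing "\n---\n" starts at
// offset 4098, so it straddles the first 4096-byte read, which covers offsets
// 4..4099 once the opening "---\n" has been consumed into the carry.
func TestYamlHeaderFilterSpansReadBoundary(t *testing.T) {
	doc := append([]byte("---\n"), bytes.Repeat([]byte("a"), 4094)...)
	doc = append(doc, "\n---\nbody\n"...)
	if !YamlHeaderFilter(infoPath{}, bytes.NewReader(doc)) {
		t.Error("terminator spanning the read boundary should be accepted")
	}
}
```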
{"incompleteYamlHeader", incompleteYamlHeader(), false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := YamlHeaderFilter(infoPath{}, tt.r) + if got != tt.want { + t.Errorf("YamlHeaderFilter() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestExtensionFilter(t *testing.T) { + tests := []struct { + name string + infoGen func(*testing.T) infoPath + ext string + want bool + }{ + {"no extension, accept .md", extensionless, ".md", false}, + {"no extension, accept all", extensionless, "", true}, + {"markdown, accept .md", markdownExtension, ".md", true}, + {"makdown, accept .png", markdownExtension, ".png", false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + filter := NewExtensionFilter(tt.ext) + ip := tt.infoGen(t) + got := filter(ip, nil) + + if got != tt.want { + t.Errorf("ExtensionFilter() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/index/index.go b/pkg/index/index.go new file mode 100644 index 0000000..074b659 --- /dev/null +++ b/pkg/index/index.go @@ -0,0 +1,275 @@ +package index + +import ( + "errors" + "fmt" + "io" + "os" + "slices" + "sync" + "time" + + "github.com/goccy/go-yaml" +) + +type Document struct { + Path string + Title string `yaml:"title"` + Date time.Time `yaml:"date"` + FileTime time.Time + Authors []string `yaml:"authors"` + Tags []string `yaml:"tags"` + Links []string + OtherMeta string // unsure about how to handle this +} + +type infoPath struct { + path string + info os.FileInfo +} + +type Index struct { + Root string // root directory for searching + Documents map[string]*Document + Filters []DocFilter +} + +func (idx Index) String() string { + // TODO: print info about active filters + return fmt.Sprintf("%s Documents[%d] Filters[%d]", idx.Root, len(idx.Documents), len(idx.Filters)) +} + +func (doc Document) Equal(other Document) bool { + if len(doc.Authors) != len(other.Authors) || len(doc.Tags) != len(other.Tags) || len(doc.Links) != len(other.Links) || doc.Path != other.Path || doc.Title != other.Title || doc.OtherMeta != other.OtherMeta || !doc.Date.Equal(other.Date) || !doc.FileTime.Equal(other.FileTime) { + return false + } + + if !slices.Equal(doc.Authors, other.Authors) { + return false + } + + slices.Sort(doc.Tags) + slices.Sort(other.Tags) + for i := range doc.Tags { + if doc.Tags[i] != other.Tags[i] { + return false + } + } + + slices.Sort(doc.Links) + slices.Sort(other.Links) + for i := range doc.Links { + if doc.Links[i] != other.Links[i] { + return false + } + } + + return true +} + +func visit(file infoPath, visitQueue chan<- infoPath, filterQueue chan<- infoPath, wg *sync.WaitGroup) { + // TODO: check if symlink, and handle appropriately + // TODO: extract error out of function + + if file.info.IsDir() { + entries, err := os.ReadDir(file.path) + if err != nil { + panic(err) + } + + wg.Add(len(entries)) + for _, entry := range entries { + entryInfo, err := entry.Info() + if err != nil { + panic(err) + } + // PERF: prevents deadlock but introduces an additional goroutine overhead per file + go func(path string) { + visitQueue <- infoPath{path: path, info: entryInfo} + }(file.path + "/" + entry.Name()) + } + } else if file.info.Mode().IsRegular() { + filterQueue <- file + } + + wg.Done() +} + +func workerTraverse(wg *sync.WaitGroup, visitQueue chan infoPath, filterQueue chan<- infoPath) { + for work := range visitQueue { + visit(work, visitQueue, filterQueue, wg) + } +} + +func (idx Index) Traverse(numWorkers uint) []string { + if numWorkers <= 1 { + 
diff --git a/pkg/index/index.go b/pkg/index/index.go
new file mode 100644
index 0000000..074b659
--- /dev/null
+++ b/pkg/index/index.go
@@ -0,0 +1,275 @@
+package index
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"slices"
+	"sync"
+	"time"
+
+	"github.com/goccy/go-yaml"
+)
+
+type Document struct {
+	Path      string
+	Title     string    `yaml:"title"`
+	Date      time.Time `yaml:"date"`
+	FileTime  time.Time
+	Authors   []string `yaml:"authors"`
+	Tags      []string `yaml:"tags"`
+	Links     []string
+	OtherMeta string // unsure about how to handle this
+}
+
+type infoPath struct {
+	path string
+	info os.FileInfo
+}
+
+type Index struct {
+	Root      string // root directory for searching
+	Documents map[string]*Document
+	Filters   []DocFilter
+}
+
+func (idx Index) String() string {
+	// TODO: print info about active filters
+	return fmt.Sprintf("%s Documents[%d] Filters[%d]", idx.Root, len(idx.Documents), len(idx.Filters))
+}
+
+func (doc Document) Equal(other Document) bool {
+	if len(doc.Authors) != len(other.Authors) || len(doc.Tags) != len(other.Tags) || len(doc.Links) != len(other.Links) || doc.Path != other.Path || doc.Title != other.Title || doc.OtherMeta != other.OtherMeta || !doc.Date.Equal(other.Date) || !doc.FileTime.Equal(other.FileTime) {
+		return false
+	}
+
+	if !slices.Equal(doc.Authors, other.Authors) {
+		return false
+	}
+
+	slices.Sort(doc.Tags)
+	slices.Sort(other.Tags)
+	for i := range doc.Tags {
+		if doc.Tags[i] != other.Tags[i] {
+			return false
+		}
+	}
+
+	slices.Sort(doc.Links)
+	slices.Sort(other.Links)
+	for i := range doc.Links {
+		if doc.Links[i] != other.Links[i] {
+			return false
+		}
+	}
+
+	return true
+}
+
+func visit(file infoPath, visitQueue chan<- infoPath, filterQueue chan<- infoPath, wg *sync.WaitGroup) {
+	// TODO: check if symlink, and handle appropriately
+	// TODO: extract error out of function
+
+	if file.info.IsDir() {
+		entries, err := os.ReadDir(file.path)
+		if err != nil {
+			panic(err)
+		}
+
+		wg.Add(len(entries))
+		for _, entry := range entries {
+			entryInfo, err := entry.Info()
+			if err != nil {
+				panic(err)
+			}
+			// PERF: prevents deadlock but introduces an additional goroutine overhead per file
+			go func(path string) {
+				visitQueue <- infoPath{path: path, info: entryInfo}
+			}(file.path + "/" + entry.Name())
+		}
+	} else if file.info.Mode().IsRegular() {
+		filterQueue <- file
+	}
+
+	wg.Done()
+}
+
+func workerTraverse(wg *sync.WaitGroup, visitQueue chan infoPath, filterQueue chan<- infoPath) {
+	for work := range visitQueue {
+		visit(work, visitQueue, filterQueue, wg)
+	}
+}
+
+func (idx Index) Traverse(numWorkers uint) []string {
+	if numWorkers <= 1 {
+		panic(fmt.Sprint("Invalid number of workers: ", numWorkers))
+	}
+	docs := make([]string, 0)
+
+	rootInfo, err := os.Stat(idx.Root)
+	if err != nil {
+		panic(err)
+	}
+
+	jobs := make(chan infoPath, numWorkers)
+	filterQueue := make(chan infoPath, numWorkers)
+
+	activeJobs := &sync.WaitGroup{}
+
+	// start workers
+	for range numWorkers {
+		go workerTraverse(activeJobs, jobs, filterQueue)
+	}
+
+	// init send
+	activeJobs.Add(1)
+	jobs <- infoPath{path: idx.Root, info: rootInfo}
+
+	// TODO: close jobs queue
+	go func() {
+		activeJobs.Wait()
+		close(jobs)
+		close(filterQueue)
+	}()
+
+	// gather
+	for doc := range filterQueue {
+		docs = append(docs, doc.path)
+	}
+
+	return docs
+}
+
+func (idx Index) FilterOne(path string) bool {
+	info, err := os.Stat(path)
+	if err != nil {
+		return false
+	}
+
+	f, err := os.Open(path)
+	if err != nil {
+		return false
+	}
+	defer f.Close()
+
+	for _, filter := range idx.Filters {
+		if !filter(infoPath{path, info}, f) {
+			return false
+		}
+		if _, err := f.Seek(0, io.SeekStart); err != nil {
+			return false
+		}
+	}
+	return true
+}
+
+func (idx Index) Filter(paths []string, numWorkers uint) []string {
+	fPaths := make([]string, 0, len(paths))
+	jobs := make(chan string, numWorkers)
+	accepted := make(chan string, numWorkers)
+	wg := &sync.WaitGroup{}
+
+	wg.Add(int(numWorkers))
+	for range numWorkers {
+		go func(jobs <-chan string, accepted chan<- string, wg *sync.WaitGroup) {
+			for path := range jobs {
+				if idx.FilterOne(path) {
+					accepted <- path
+				}
+			}
+			wg.Done()
+		}(jobs, accepted, wg)
+	}
+
+	go func(jobs chan<- string) {
+		for _, path := range paths {
+			jobs <- path
+		}
+		close(jobs)
+	}(jobs)
+
+	go func() {
+		wg.Wait()
+		close(accepted)
+	}()
+
+	for path := range accepted {
+		fPaths = append(fPaths, path)
+	}
+
+	return fPaths
+}
+
+func (idx Index) ParseOne(path string) (*Document, error) {
+	doc := &Document{Path: path}
+
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	info, err := f.Stat()
+	if err != nil {
+		return nil, err
+	}
+	doc.FileTime = info.ModTime()
+
+	buf := make([]byte, 4, 1024)
+	n, err := f.Read(buf)
+	if err != nil {
+		return nil, err
+	} else if n != 4 {
+		return nil, errors.New("short read")
+	}
+
+	// TODO: implement custom unmarshaller, for singular `Author`
+	dec := yaml.NewDecoder(f)
+	// TODO: handle no yaml header error
+	if err := dec.Decode(doc); err != nil {
+		panic(err)
+	}
+
+	// TODO: body parsing
+
+	return doc, nil
+}
+
+func (idx Index) Parse(paths []string, numWorkers uint) {
+	jobs := make(chan string, numWorkers)
+	results := make(chan Document, numWorkers)
+	wg := &sync.WaitGroup{}
+
+	wg.Add(int(numWorkers))
+	for range numWorkers {
+		go func(jobs <-chan string, results chan<- Document, wg *sync.WaitGroup) {
+			for path := range jobs {
+				doc, err := idx.ParseOne(path)
+				if err != nil {
+					// TODO: propagate error
+					panic(err)
+				}
+
+				results <- *doc
+			}
+			wg.Done()
+		}(jobs, results, wg)
+	}
+
+	go func(jobs chan<- string, paths []string) {
+		for _, path := range paths {
+			jobs <- path
+		}
+		close(jobs)
+	}(jobs, paths)
+
+	go func(results chan Document, wg *sync.WaitGroup) {
+		wg.Wait()
+		close(results)
+	}(results, wg)
+
+	for doc := range results {
+		idx.Documents[doc.Path] = &doc
+	}
+}
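Taken together, the three exported stages form a pipeline: `Traverse` walks `Root` concurrently and returns candidate paths, `Filter` keeps the paths whose files pass every `DocFilter`, and `Parse` decodes the yaml headers into `Documents`. A usage sketch (the module path is hypothetical; note that `Documents` must be non-nil before `Parse` and that `Traverse` panics when given fewer than two workers):

```go
package main

import (
	"fmt"

	"example.com/you/yourmod/pkg/index" // hypothetical module path
)

func main() {
	idx := index.Index{
		Root:      "./notes",
		Documents: map[string]*index.Document{}, // Parse writes into this map
		Filters:   index.DefaultFilters(),
	}

	paths := idx.Traverse(4)         // concurrent directory walk
	accepted := idx.Filter(paths, 4) // every filter must accept each file
	idx.Parse(accepted, 4)           // yaml headers -> idx.Documents

	fmt.Println(idx) // e.g. "./notes Documents[12] Filters[2]"
}
```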
diff --git a/pkg/index/index_test.go b/pkg/index/index_test.go
new file mode 100644
index 0000000..0b5d2f2
--- /dev/null
+++ b/pkg/index/index_test.go
@@ -0,0 +1,138 @@
+package index
+
+import (
+	"fmt"
+	"os"
+	"slices"
+	"testing"
+)
+
+var indexCases map[string]func(t *testing.T) Index
+
+func init() {
+	indexCases = make(map[string]func(t *testing.T) Index)
+
+	indexCases["single file"] = func(t *testing.T) Index {
+		root := t.TempDir()
+		index := Index{Root: root, Filters: []DocFilter{NewExtensionFilter(".md")}}
+
+		f, err := os.Create(root + "/a_file.md")
+		if err != nil {
+			t.Fatal(err)
+		}
+		f.WriteString("some file contents\n")
+
+		return index
+	}
+
+	indexCases["large file"] = func(t *testing.T) Index {
+		root := t.TempDir()
+		index := Index{Root: root}
+
+		return index
+	}
+
+	indexCases["worker saturation"] = func(t *testing.T) Index {
+		root := t.TempDir()
+		index := Index{Root: root}
+
+		permission := os.FileMode(0o777)
+		for _, dirName := range []string{"a", "b", "c", "d", "e", "f"} {
+			dir := root + "/" + dirName
+			if err := os.Mkdir(dir, permission); err != nil {
+				t.Fatal(err)
+			}
+			for i := range 8 {
+				fName := fmt.Sprint(dirName, i)
+				f, err := os.Create(dir + "/" + fName)
+				if err != nil {
+					t.Fatal(err)
+				}
+				f.WriteString(fName)
+			}
+		}
+
+		return index
+	}
+}
+
+func TestIndex_Traverse(t *testing.T) {
+	tests := []struct {
+		name       string
+		indexCase  func(t *testing.T) Index
+		numWorkers uint
+		want       []string
+	}{
+		{name: "single file", indexCase: indexCases["single file"], numWorkers: 2, want: []string{"a_file.md"}},
+		{name: "saturation test", indexCase: indexCases["worker saturation"], numWorkers: 2, want: []string{
+			"a/a0", "a/a1", "a/a2", "a/a3", "a/a4", "a/a5", "a/a6", "a/a7",
+			"b/b0", "b/b1", "b/b2", "b/b3", "b/b4", "b/b5", "b/b6", "b/b7",
+			"c/c0", "c/c1", "c/c2", "c/c3", "c/c4", "c/c5", "c/c6", "c/c7",
+			"d/d0", "d/d1", "d/d2", "d/d3", "d/d4", "d/d5", "d/d6", "d/d7",
+			"e/e0", "e/e1", "e/e2", "e/e3", "e/e4", "e/e5", "e/e6", "e/e7",
+			"f/f0", "f/f1", "f/f2", "f/f3", "f/f4", "f/f5", "f/f6", "f/f7",
+		}},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			idx := tt.indexCase(t)
+			got := idx.Traverse(tt.numWorkers)
+
+			slices.Sort(got)
+			slices.Sort(tt.want)
+
+			n := min(len(got), len(tt.want))
+			if len(got) != len(tt.want) {
+				t.Errorf("Wanted %v got %v paths", len(tt.want), len(got))
+				t.Logf("Checking up to %d values", n)
+			}
+
+			for i := range n {
+				gotPath := got[i]
+				wantPath := idx.Root + "/" + tt.want[i]
+				if gotPath != wantPath {
+					t.Errorf("At %d wanted %v, got %v", i, wantPath, gotPath)
+				}
+			}
+		})
+	}
+}
+
+func TestIndex_Filter(t *testing.T) {
+	tests := []struct {
+		name       string
+		paths      []string
+		indexCase  func(t *testing.T) Index
+		numWorkers uint
+		want       []string
+	}{
+		{"single file", []string{"a_file.md"}, indexCases["single file"], 2, []string{"a_file.md"}},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			idx := tt.indexCase(t)
+			for i, path := range tt.paths {
+				tt.paths[i] = idx.Root + "/" + path
+			}
+
+			got := idx.Filter(tt.paths, tt.numWorkers)
+
+			slices.Sort(got)
+			slices.Sort(tt.want)
+
+			n := min(len(got), len(tt.want))
+			if len(got) != len(tt.want) {
+				t.Errorf("Wanted %v got %v paths", len(tt.want), len(got))
+				t.Logf("Checking up to %d values", n)
+			}
+
+			for i := range n {
+				gotPath := got[i]
+				wantPath := idx.Root + "/" + tt.want[i]
+				if gotPath != wantPath {
+					t.Errorf("At %d wanted %v, got %v", i, wantPath, gotPath)
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/index/schema.sql b/pkg/index/schema.sql
new file mode 100644
index 0000000..fb06351
--- /dev/null
+++ b/pkg/index/schema.sql
@@ -0,0 +1,70 @@
+-- TABLE of config values
+CREATE TABLE Indexes(
+    root TEXT NOT NULL,
+    followSym BOOLEAN -- whether to follow symlinks during traversal
+);
+
+-- Schema
+CREATE TABLE Documents(
+    id INTEGER PRIMARY KEY,
+    path TEXT UNIQUE NOT NULL,
+    title TEXT,
+    date INT,
+    fileTime INT,
+    meta BLOB
+);
+
+CREATE TABLE Authors(
+    id INTEGER PRIMARY KEY,
+    name TEXT UNIQUE NOT NULL
+);
+
+CREATE TABLE Aliases(
+    authorId INT NOT NULL,
+    alias TEXT UNIQUE NOT NULL,
+    FOREIGN KEY (authorId) REFERENCES Authors(id)
+);
+
+CREATE TABLE Tags(
+    id INTEGER PRIMARY KEY,
+    name TEXT UNIQUE NOT NULL
+);
+
+CREATE TABLE Links(
+    referencedId INT,
+    refererId INT,
+    FOREIGN KEY (referencedId) REFERENCES Documents(id),
+    FOREIGN KEY (refererId) REFERENCES Documents(id)
+);
+
+CREATE TABLE DocumentAuthors(
+    docId INT NOT NULL,
+    authorId INT NOT NULL,
+    FOREIGN KEY (docId) REFERENCES Documents(id),
+    FOREIGN KEY (authorId) REFERENCES Authors(id)
+);
+
+CREATE TABLE DocumentTags(
+    docId INT NOT NULL,
+    tagId INT NOT NULL,
+    FOREIGN KEY (docId) REFERENCES Documents(id),
+    FOREIGN KEY (tagId) REFERENCES Tags(id),
+    UNIQUE(docId, tagId)
+);
+
+-- Indexes
+CREATE INDEX idx_doc_dates
+ON Documents (date);
+CREATE INDEX idx_doc_titles
+ON Documents (title);
+
+CREATE INDEX idx_author_name
+ON Authors(name);
+
+CREATE INDEX idx_aliases_alias
+ON Aliases(alias);
+CREATE INDEX idx_aliases_authorId
+ON Aliases(authorId);
+
+CREATE INDEX idx_doctags_tagid
+ON DocumentTags (tagId);
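For context, a query sketch against this schema (hypothetical, not part of the commit): resolving the documents carrying a given tag through the `DocumentTags` join table, which is the access pattern `idx_doctags_tagid` is there to serve.

```sql
-- Hypothetical query, not part of the commit: documents tagged 'golang',
-- newest first; idx_doctags_tagid covers the join on tagId.
SELECT d.path, d.title, d.date
FROM Documents d
JOIN DocumentTags dt ON dt.docId = d.id
JOIN Tags t ON t.id = dt.tagId
WHERE t.name = 'golang'
ORDER BY d.date DESC;
```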
