diff options
| author | JP Appel <jeanpierre.appel01@gmail.com> | 2025-04-27 00:49:27 -0400 |
|---|---|---|
| committer | JP Appel <jeanpierre.appel01@gmail.com> | 2025-04-27 00:49:27 -0400 |
| commit | 34b8d8ff1f9d65c08a9156d72f08cf548183c6f4 (patch) | |
| tree | a00fa0410a7bcde125a37b50b3a4956c838fa569 /pkg/index/index.go | |
| parent | 42527fdb0aca0d30652bb3052b80ab75ab057572 (diff) | |
Large commit; many features
Diffstat (limited to 'pkg/index/index.go')
| -rw-r--r-- | pkg/index/index.go | 275 |
1 file changed, 275 insertions, 0 deletions
diff --git a/pkg/index/index.go b/pkg/index/index.go new file mode 100644 index 0000000..074b659 --- /dev/null +++ b/pkg/index/index.go @@ -0,0 +1,275 @@ +package index + +import ( + "errors" + "fmt" + "io" + "os" + "slices" + "sync" + "time" + + "github.com/goccy/go-yaml" +) + +type Document struct { + Path string + Title string `yaml:"title"` + Date time.Time `yaml:"date"` + FileTime time.Time + Authors []string `yaml:"authors"` + Tags []string `yaml:"tags"` + Links []string + OtherMeta string // unsure about how to handle this +} + +type infoPath struct { + path string + info os.FileInfo +} + +type Index struct { + Root string // root directory for searching + Documents map[string]*Document + Filters []DocFilter +} + +func (idx Index) String() string { + // TODO: print info about active filters + return fmt.Sprintf("%s Documents[%d] Filters[%d]", idx.Root, len(idx.Documents), len(idx.Filters)) +} + +func (doc Document) Equal(other Document) bool { + if len(doc.Authors) != len(other.Authors) || len(doc.Tags) != len(other.Tags) || len(doc.Links) != len(other.Links) || doc.Path != other.Path || doc.Title != other.Title || doc.OtherMeta != other.OtherMeta || !doc.Date.Equal(other.Date) || !doc.FileTime.Equal(other.FileTime) { + return false + } + + if !slices.Equal(doc.Authors, other.Authors) { + return false + } + + slices.Sort(doc.Tags) + slices.Sort(other.Tags) + for i := range doc.Tags { + if doc.Tags[i] != other.Tags[i] { + return false + } + } + + slices.Sort(doc.Links) + slices.Sort(other.Links) + for i := range doc.Links { + if doc.Links[i] != other.Links[i] { + return false + } + } + + return true +} + +func visit(file infoPath, visitQueue chan<- infoPath, filterQueue chan<- infoPath, wg *sync.WaitGroup) { + // TODO: check if symlink, and handle appropriately + // TODO: extract error out of function + + if file.info.IsDir() { + entries, err := os.ReadDir(file.path) + if err != nil { + panic(err) + } + + wg.Add(len(entries)) + for _, entry := range entries { 
+ entryInfo, err := entry.Info() + if err != nil { + panic(err) + } + // PERF: prevents deadlock but introduces an additional goroutine overhead per file + go func(path string) { + visitQueue <- infoPath{path: path, info: entryInfo} + }(file.path + "/" + entry.Name()) + } + } else if file.info.Mode().IsRegular() { + filterQueue <- file + } + + wg.Done() +} + +func workerTraverse(wg *sync.WaitGroup, visitQueue chan infoPath, filterQueue chan<- infoPath) { + for work := range visitQueue { + visit(work, visitQueue, filterQueue, wg) + } +} + +func (idx Index) Traverse(numWorkers uint) []string { + if numWorkers <= 1 { + panic(fmt.Sprint("Invalid number of workers: ", numWorkers)) + } + docs := make([]string, 0) + + rootInfo, err := os.Stat(idx.Root) + if err != nil { + panic(err) + } + + jobs := make(chan infoPath, numWorkers) + filterQueue := make(chan infoPath, numWorkers) + + activeJobs := &sync.WaitGroup{} + + // start workers + for range numWorkers { + go workerTraverse(activeJobs, jobs, filterQueue) + } + + // init send + activeJobs.Add(1) + jobs <- infoPath{path: idx.Root, info: rootInfo} + + // TODO: close jobs queue + go func() { + activeJobs.Wait() + close(jobs) + close(filterQueue) + }() + + // gather + for doc := range filterQueue { + docs = append(docs, doc.path) + } + + return docs +} + +func (idx Index) FilterOne(path string) bool { + info, err := os.Stat(string(path)) + if err != nil { + return false + } + + f, err := os.Open(string(path)) + if err != nil { + return false + } + defer f.Close() + + for _, filter := range idx.Filters { + if !filter(infoPath{string(path), info}, f) { + return false + } + if _, err := f.Seek(0, io.SeekStart); err != nil { + return false + } + } + return true +} + +func (idx Index) Filter(paths []string, numWorkers uint) []string { + fPaths := make([]string, 0, len(paths)) + jobs := make(chan string, numWorkers) + accepted := make(chan string, numWorkers) + wg := &sync.WaitGroup{} + + wg.Add(int(numWorkers)) + for range 
numWorkers { + go func(jobs <-chan string, accepted chan<- string, wg *sync.WaitGroup) { + for path := range jobs { + if idx.FilterOne(path) { + accepted <- path + } + } + wg.Done() + }(jobs, accepted, wg) + } + + go func(jobs chan<- string) { + for _, path := range paths { + jobs <- path + } + close(jobs) + }(jobs) + + go func() { + wg.Wait() + close(accepted) + }() + + for path := range accepted { + fPaths = append(fPaths, path) + } + + return fPaths +} + +func (idx Index) ParseOne(path string) (*Document, error) { + doc := &Document{} + doc.Path = path + + f, err := os.Open(string(path)) + if err != nil { + return nil, err + } + + info, err := f.Stat() + if err != nil { + return nil, err + } + doc.FileTime = info.ModTime() + + buf := make([]byte, 4, 1024) + n, err := f.Read(buf) + if err != nil { + return nil, err + } else if n != 4 { + return nil, errors.New("Short read") + } + + // TODO: implement custom unmarshaller, for singular `Author` + dec := yaml.NewDecoder(f) + // TODO: handle no yaml header error + if err := dec.Decode(&doc); err != nil { + panic(err) + } + + // TODO: body parsing + + return doc, nil +} + +func (idx Index) Parse(paths []string, numWorkers uint) { + jobs := make(chan string, numWorkers) + results := make(chan Document, numWorkers) + wg := &sync.WaitGroup{} + + wg.Add(int(numWorkers)) + for range numWorkers { + go func(jobs <-chan string, results chan<- Document, wg *sync.WaitGroup) { + for path := range jobs { + doc, err := idx.ParseOne(path) + if err != nil { + // TODO: propagate error + panic(err) + } + + results <- *doc + } + wg.Done() + }(jobs, results, wg) + } + + go func(jobs chan<- string, paths []string) { + for _, path := range paths { + jobs <- path + } + close(jobs) + }(jobs, paths) + + go func(results chan Document, wg *sync.WaitGroup) { + wg.Wait() + close(results) + }(results, wg) + + for doc := range results { + idx.Documents[doc.Path] = &doc + } +} |
