aboutsummaryrefslogtreecommitdiffstats
path: root/pkg/index
diff options
context:
space:
mode:
authorJP Appel <jeanpierre.appel01@gmail.com>2025-05-02 17:22:14 -0400
committerJP Appel <jeanpierre.appel01@gmail.com>2025-05-02 17:22:14 -0400
commit37a96c43f6df141dc745f239891f4163b8870c02 (patch)
tree1014bf3ca04b4ad078a7819ac97fb5d6afda921d /pkg/index
parent966a1162a56652b4d56ffe003af05161841fb192 (diff)
Implement YAML header parsing
Parses `title` and `tags` fields using default behavior. Custom parsing logic is used for `author` (single and multiauthor support), `date` (parses into `time.Time`), and meta (collects all other header fields into a YAML string).
Diffstat (limited to 'pkg/index')
-rw-r--r--pkg/index/filters.go14
-rw-r--r--pkg/index/index.go168
-rw-r--r--pkg/index/index_test.go142
3 files changed, 295 insertions, 29 deletions
diff --git a/pkg/index/filters.go b/pkg/index/filters.go
index f59a5d6..315d125 100644
--- a/pkg/index/filters.go
+++ b/pkg/index/filters.go
@@ -5,6 +5,7 @@ import (
"path/filepath"
)
+// NOTE: in the future it would be interesting to support lua filters
// TODO: create excluded path filter factory
type DocFilter func(infoPath, io.ReadSeeker) bool
@@ -21,6 +22,17 @@ func NewMaxFilesizeFilter(size int64) DocFilter {
}
}
+// NewFilenameFilter builds a DocFilter that matches files by base name.
+//
+// The returned filter reports true when the last element of the candidate's
+// path (as given by filepath.Base) is one of the names in excluded, and false
+// otherwise. The reader argument is ignored.
+//
+// NOTE(review): the filter yields true for *excluded* names; confirm this is
+// the polarity callers expect from a DocFilter.
+func NewFilenameFilter(excluded []string) DocFilter {
+	// Build the lookup set once so each filter call is a single map probe.
+	names := make(map[string]struct{}, len(excluded))
+	for _, name := range excluded {
+		names[name] = struct{}{}
+	}
+
+	return func(ip infoPath, _ io.ReadSeeker) bool {
+		_, found := names[filepath.Base(ip.path)]
+		return found
+	}
+}
+
func YamlHeaderFilter(_ infoPath, r io.ReadSeeker) bool {
const bufSize = 4096
buf := make([]byte, bufSize)
@@ -80,5 +92,5 @@ func YamlHeaderFilter(_ infoPath, r io.ReadSeeker) bool {
}
func DefaultFilters() []DocFilter {
- return []DocFilter{NewExtensionFilter(".md"), NewMaxFilesizeFilter(200 * 1024)}
+ return []DocFilter{NewExtensionFilter(".md"), NewMaxFilesizeFilter(200 * 1024), YamlHeaderFilter}
}
diff --git a/pkg/index/index.go b/pkg/index/index.go
index 4bab21b..d6839bd 100644
--- a/pkg/index/index.go
+++ b/pkg/index/index.go
@@ -4,23 +4,28 @@ import (
"errors"
"fmt"
"io"
+ "log/slog"
"os"
"slices"
+ "strings"
"sync"
"time"
"github.com/goccy/go-yaml"
+ "github.com/goccy/go-yaml/ast"
)
+var ErrHeaderParse error = errors.New("Unable to parse YAML header")
+
type Document struct {
Path string `yaml:"-" json:"path"`
Title string `yaml:"title" json:"title"`
- Date time.Time `yaml:"date" json:"date"`
+ Date time.Time `yaml:"-" json:"date"`
FileTime time.Time `yaml:"-" json:"filetime"`
- Authors []string `yaml:"authors" json:"authors"`
- Tags []string `yaml:"tags" json:"tags"`
- Links []string
- OtherMeta string // unsure about how to handle this
+ Authors []string `yaml:"-" json:"authors"`
+ Tags []string `yaml:"tags,omitempty" json:"tags"`
+ Links []string `yaml:"-" json:"links"`
+ OtherMeta string `yaml:"-" json:"meta"`
}
type infoPath struct {
@@ -39,8 +44,125 @@ func (idx Index) String() string {
return fmt.Sprintf("%s Documents[%d] Filters[%d]", idx.Root, len(idx.Documents), len(idx.Filters))
}
+var _ yaml.NodeUnmarshaler = (*Document)(nil)
+
+// UnmarshalYAML fills doc from the root node of a YAML header.
+//
+// The `title` and `tags` keys are decoded through Document's struct tags.
+// The `date` key is handed to parseDateNode and the `author` key to
+// parseAuthor. Every other top-level entry is marshalled back to YAML and
+// appended to OtherMeta, one entry per line.
+//
+// ErrHeaderParse is returned when node is not a mapping. Errors from the
+// decoder, from the date/author helpers, or from re-marshalling an entry are
+// returned as-is.
+func (doc *Document) UnmarshalYAML(node ast.Node) error {
+	// plain has Document's fields but none of its methods, so decoding into
+	// it cannot re-enter this UnmarshalYAML.
+	type plain Document
+	var decoded plain
+	if err := yaml.NodeToValue(node, &decoded); err != nil {
+		return err
+	}
+	doc.Title, doc.Tags = decoded.Title, decoded.Tags
+
+	mapping, isMapping := node.(*ast.MappingNode)
+	if !isMapping {
+		return ErrHeaderParse
+	}
+
+	var meta strings.Builder
+	for _, entry := range mapping.Values {
+		switch entry.Key.GetPath() {
+		case "$.title", "$.tags":
+			// Already taken from the struct decode above.
+		case "$.date":
+			if err := doc.parseDateNode(entry.Value); err != nil {
+				return err
+			}
+		case "$.author":
+			if err := doc.parseAuthor(entry.Value); err != nil {
+				return err
+			}
+		default:
+			// Unknown key: keep it verbatim as YAML text.
+			raw, err := entry.MarshalYAML()
+			if err != nil {
+				return err
+			}
+			meta.Write(raw)
+			meta.WriteByte('\n')
+		}
+	}
+
+	doc.OtherMeta = meta.String()
+	return nil
+}
+
+func (doc *Document) parseDateNode(node ast.Node) error {
+ dateNode, ok := node.(*ast.StringNode)
+ if !ok {
+ return ErrHeaderParse
+ }
+ dateStr := dateNode.Value
+
+ if dateStr == "" {
+ return nil
+ }
+
+ dateFormats := []string{
+ "Jan _2, 2006",
+ "January 2, 2006",
+ time.DateOnly,
+ time.DateTime,
+ time.Layout,
+ time.ANSIC,
+ time.UnixDate,
+ time.RubyDate,
+ time.RFC822,
+ time.RFC822Z,
+ time.RFC850,
+ time.RFC1123,
+ time.RFC1123Z,
+ time.RFC3339,
+ }
+
+ var t time.Time
+ var err error
+ for _, layout := range dateFormats {
+ if t, err = time.Parse(layout, dateStr); err == nil {
+ doc.Date = t
+ return nil
+ }
+ }
+
+ return fmt.Errorf("Unable to parse date: %s", dateNode.Value)
+}
+
+func (doc *Document) parseAuthor(node ast.Node) error {
+ authorsNode, ok := node.(*ast.SequenceNode)
+ if ok {
+ doc.Authors = make([]string, 0, len(authorsNode.Values))
+ for _, authorNode := range authorsNode.Values {
+ authorStrNode, ok := authorNode.(*ast.StringNode)
+ if !ok {
+ return ErrHeaderParse
+ }
+ doc.Authors = append(doc.Authors, authorStrNode.Value)
+ }
+ } else {
+ authorNode, ok := node.(*ast.StringNode)
+ if ok {
+ doc.Authors = []string{authorNode.Value}
+ } else {
+ return ErrHeaderParse
+ }
+ }
+
+ return nil
+}
+
func (doc Document) Equal(other Document) bool {
- if len(doc.Authors) != len(other.Authors) || len(doc.Tags) != len(other.Tags) || len(doc.Links) != len(other.Links) || doc.Path != other.Path || doc.Title != other.Title || doc.OtherMeta != other.OtherMeta || !doc.Date.Equal(other.Date) || !doc.FileTime.Equal(other.FileTime) {
+ if len(doc.Authors) != len(other.Authors) || len(doc.Tags) != len(other.Tags) || len(doc.Links) != len(other.Links) || doc.Path != other.Path || doc.Title != other.Title || doc.OtherMeta != other.OtherMeta || !doc.Date.Equal(other.Date) {
return false
}
@@ -126,7 +248,7 @@ func (idx Index) Traverse(numWorkers uint) []string {
activeJobs.Add(1)
jobs <- infoPath{path: idx.Root, info: rootInfo}
- // TODO: close jobs queue
+ // close jobs queue
go func() {
activeJobs.Wait()
close(jobs)
@@ -201,6 +323,7 @@ func (idx Index) Filter(paths []string, numWorkers uint) []string {
return fPaths
}
+// TODO: extract from struct
func (idx Index) ParseOne(path string) (*Document, error) {
doc := &Document{}
doc.Path = path
@@ -209,6 +332,7 @@ func (idx Index) ParseOne(path string) (*Document, error) {
if err != nil {
return nil, err
}
+ defer f.Close()
info, err := f.Stat()
if err != nil {
@@ -216,31 +340,16 @@ func (idx Index) ParseOne(path string) (*Document, error) {
}
doc.FileTime = info.ModTime()
- buf := make([]byte, 4, 1024)
- n, err := f.Read(buf)
- if err != nil {
- return nil, err
- } else if n != 4 {
- return nil, errors.New("Short read")
+ if err := yaml.NewDecoder(f).Decode(doc); err != nil {
+ return nil, errors.Join(ErrHeaderParse, err)
}
- // FIXME: unmarshalling is **VERY** borked up rn
- if err := yaml.Unmarshal(buf, &doc); err != nil {
- return nil, err
- }
- // TODO: implement custom unmarshaller, for singular `Author`
- // dec := yaml.NewDecoder(f)
- // TODO: handle no yaml header error
- // if err := dec.Decode(&doc); err != nil {
- // panic(err)
- // }
-
- // TODO: body parsing
-
+ // TODO: read the rest of the file to find links
return doc, nil
}
-func (idx Index) Parse(paths []string, numWorkers uint) {
+// TODO: separate method from struct
+func (idx *Index) Parse(paths []string, numWorkers uint) {
jobs := make(chan string, numWorkers)
results := make(chan Document, numWorkers)
idx.Documents = make(map[string]*Document, len(paths))
@@ -253,7 +362,10 @@ func (idx Index) Parse(paths []string, numWorkers uint) {
doc, err := idx.ParseOne(path)
if err != nil {
// TODO: propagate error
- panic(err)
+ slog.Error("Error occured while parsing file",
+ slog.String("path", path), slog.String("err", err.Error()),
+ )
+ continue
}
results <- *doc
diff --git a/pkg/index/index_test.go b/pkg/index/index_test.go
index 0b5d2f2..ed7e550 100644
--- a/pkg/index/index_test.go
+++ b/pkg/index/index_test.go
@@ -1,10 +1,12 @@
package index
import (
+ "errors"
"fmt"
"os"
"slices"
"testing"
+ "time"
)
var indexCases map[string]func(t *testing.T) Index
@@ -136,3 +138,143 @@ func TestIndex_Filter(t *testing.T) {
})
}
}
+
+// newTestFile creates an empty file called name inside a fresh temporary
+// directory obtained from t.TempDir and returns the open file together with
+// its path. The caller is responsible for closing the file.
+//
+// If the file cannot be created the calling test is failed immediately
+// rather than panicking, so the failure is attributed to that test.
+func newTestFile(t *testing.T, name string) (*os.File, string) {
+	t.Helper() // report failures at the caller's line
+
+	path := t.TempDir() + "/" + name
+	f, err := os.Create(path)
+	if err != nil {
+		t.Fatalf("creating test file %s: %v", path, err)
+	}
+
+	return f, path
+}
+
+// TestIndex_ParseOne writes small files with YAML headers to disk and checks
+// that Index.ParseOne turns them into the expected Document, or fails with
+// the expected error.
+func TestIndex_ParseOne(t *testing.T) {
+	// writeFile creates a fresh test file holding content and returns its
+	// path, failing the test if the fixture cannot be written.
+	writeFile := func(t *testing.T, name, content string) string {
+		t.Helper()
+		f, path := newTestFile(t, name)
+		_, writeErr := f.WriteString(content)
+		if closeErr := f.Close(); writeErr != nil || closeErr != nil {
+			t.Fatalf("preparing %s: %v", path, errors.Join(writeErr, closeErr))
+		}
+		return path
+	}
+
+	tests := []struct {
+		name     string
+		fileName string
+		content  string
+		want     *Document
+		wantErr  error
+	}{
+		{
+			name:     "title only header",
+			fileName: "title",
+			content:  "---\ntitle: A title\n---\n",
+			want:     &Document{Title: "A title"},
+		},
+		{
+			name:     "tags",
+			fileName: "tags",
+			content:  "---\ntags:\n- a\n- b\n- c\n---\n",
+			want:     &Document{Tags: []string{"a", "b", "c"}},
+		},
+		{
+			name:     "date",
+			fileName: "date",
+			// Deliberately has no closing "---" line.
+			content: "---\ndate: May 1, 2025",
+			want:    &Document{Date: time.Date(2025, time.May, 1, 0, 0, 0, 0, time.UTC)},
+		},
+		{
+			name:     "single author",
+			fileName: "author",
+			content:  "---\nauthor: Rob Pike\n---\n",
+			want:     &Document{Authors: []string{"Rob Pike"}},
+		},
+		{
+			name:     "multi author",
+			fileName: "author",
+			content:  "---\nauthor:\n- Robert Griesemer\n- Rob Pike\n- Ken Thompson\n---\n",
+			want:     &Document{Authors: []string{"Robert Griesemer", "Rob Pike", "Ken Thompson"}},
+		},
+		{
+			name:     "meta",
+			fileName: "metadata",
+			content:  "---\nunknownKey: value\n---\n",
+			want:     &Document{OtherMeta: "unknownKey: value\n"},
+		},
+		{
+			name:     "bad tags",
+			fileName: "badtags",
+			// "-bad tag" lacks the space after the dash.
+			content: "---\ntags:\n- good tag\n-bad tag\n---\n",
+			want:    &Document{},
+			wantErr: ErrHeaderParse,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			path := writeFile(t, tt.fileName, tt.content)
+			tt.want.Path = path
+
+			got, gotErr := Index{}.ParseOne(path)
+
+			// Stop here on an error mismatch: got is nil whenever ParseOne
+			// fails, so carrying on to got.Equal would dereference nil.
+			if !errors.Is(gotErr, tt.wantErr) {
+				t.Fatalf("Recieved unexpected error: want %v got %v", tt.wantErr, gotErr)
+			}
+			if gotErr != nil {
+				return
+			}
+
+			if !got.Equal(*tt.want) {
+				t.Error("Recieved document is not equal")
+				t.Logf("Got = %+v", got)
+				t.Logf("Want = %+v", tt.want)
+			}
+		})
+	}
+}