Add header parsing and improved link parsing

author: JP Appel <jeanpierre.appel01@gmail.com> 2025-08-05 12:03:32 -0400
committer: JP Appel <jeanpierre.appel01@gmail.com> 2025-08-05 12:03:32 -0400
commit: 62aeb1a0fb0a239f6193b7cb872787578480082c (patch)
tree: 889b9e3c3f27cbac2be335e30076a51fb548334a /pkg/index
parent: f14c466d5d5d1f1a68153162349a74a154bcb535 (diff)
2 files changed, 91 insertions, 21 deletions
diff --git a/pkg/index/index.go b/pkg/index/index.go
index b79ef2f..c69f36d 100644
--- a/pkg/index/index.go
+++ b/pkg/index/index.go
@@ -21,9 +21,8 @@ import (
 )
 
 var ErrHeaderParse error = errors.New("Unable to parse YAML header")
-var linkRegex *regexp.Regexp
+var DocParseRegex *regexp.Regexp
 
-// TODO: add headings field
 type Document struct {
 	Path      string    `yaml:"-" json:"path"`
 	Title     string    `yaml:"title" json:"title"`
@@ -32,17 +31,18 @@ type Document struct {
 	Authors   []string  `yaml:"-" json:"authors"`
 	Tags      []string  `yaml:"tags,omitempty" json:"tags"`
 	Links     []string  `yaml:"-" json:"links"`
+	Headings  string    `yaml:"-" json:"headings"`
 	OtherMeta string    `yaml:"-" json:"meta"`
 	parseOpts ParseOpts
 }
 
 type ParseOpts struct {
 	ParseMeta       bool
+	ParseHeadings   bool
 	ParseLinks      bool
 	IgnoreDateError bool
 	IgnoreMetaError bool
 	IgnoreHidden    bool
-	// TODO: add IgnoreHeadings
 }
 
 type InfoPath struct {
@@ -77,13 +77,13 @@ var _ yaml.BytesMarshaler = (*Document)(nil)
 func (doc *Document) MarshalYAML() ([]byte, error) {
 	return yaml.Marshal(yaml.MapSlice{
 		{Key: "path", Value: doc.Path},
-		// TODO: add headings
 		{Key: "title", Value: doc.Title},
 		{Key: "date", Value: doc.Date},
 		{Key: "filetime", Value: doc.FileTime},
 		{Key: "authors", Value: doc.Authors},
 		{Key: "tags", Value: doc.Tags},
 		{Key: "links", Value: doc.Links},
+		{Key: "headings", Value: doc.Headings},
 		{Key: "meta", Value: doc.OtherMeta},
 	})
 }
@@ -188,7 +188,7 @@ func (doc *Document) parseAuthor(node ast.Node) error {
 }
 
 func (doc Document) Equal(other Document) bool {
-	if len(doc.Authors) != len(other.Authors) || len(doc.Tags) != len(other.Tags) || len(doc.Links) != len(other.Links) || doc.Path != other.Path || doc.Title != other.Title || doc.OtherMeta != other.OtherMeta || !doc.Date.Equal(other.Date) {
+	if len(doc.Authors) != len(other.Authors) || len(doc.Tags) != len(other.Tags) || len(doc.Links) != len(other.Links) || doc.Path != other.Path || doc.Title != other.Title || doc.OtherMeta != other.OtherMeta || doc.Headings != other.Headings || !doc.Date.Equal(other.Date) {
 		return false
 	}
 
@@ -212,7 +212,6 @@ func (doc Document) Equal(other Document) bool {
 		}
 	}
 
-	// TODO: handle headings
 	return true
 }
 
@@ -269,23 +268,19 @@ func (idx Index) Traverse(numWorkers uint, ignoreHidden bool) []string {
 
 	activeJobs := &sync.WaitGroup{}
 
-	// start workers
 	for range numWorkers {
 		go workerTraverse(activeJobs, ignoreHidden, jobs, filterQueue)
 	}
 
-	// init send
 	activeJobs.Add(1)
 	jobs <- InfoPath{Path: idx.Root, Info: rootInfo}
 
-	// close jobs queue
 	go func() {
 		activeJobs.Wait()
 		close(jobs)
 		close(filterQueue)
 	}()
 
-	// gather
 	for doc := range filterQueue {
 		docs = append(docs, doc.Path)
 	}
@@ -381,7 +376,10 @@ func NewDocCmp(field string, reverse bool) (func(*Document, *Document) int, bool
 		return func(a, b *Document) int {
 			return descMod * strings.Compare(a.OtherMeta, b.OtherMeta)
 		}, true
-		// TODO: add headings
+	case "headings":
+		return func(a, b *Document) int {
+			return descMod * strings.Compare(a.Headings, b.Headings)
+		}, true
 	}
 
 	return nil, false
@@ -413,24 +411,44 @@ func ParseDoc(path string, opts ParseOpts) (*Document, error) {
 		return nil, errors.Join(ErrHeaderParse, err)
 	}
 
-	// TODO: parse headings simultaneously with links
-	if opts.ParseLinks {
+	if opts.ParseLinks || opts.ParseHeadings {
 		var buf bytes.Buffer
 		f.Seek(pos, io.SeekStart)
 		if _, err := io.Copy(&buf, f); err != nil {
 			return nil, err
 		}
 
-		matches := linkRegex.FindAllSubmatch(buf.Bytes(), -1)
+		const (
+			MATCH = iota
+			LH_HEADING
+			LH_LINK
+			HEADING
+			LINK
+		)
+
+		matches := DocParseRegex.FindAllSubmatch(buf.Bytes(), -1)
+		b := strings.Builder{}
 		for _, match := range matches {
-			if len(match) != 2 {
-				panic("Link parsing regex returned unexpected number of matches")
+			if opts.ParseHeadings {
+				if len(match[LH_HEADING]) != 0 {
+					b.Write(match[LH_HEADING])
+					b.WriteByte('\n')
+				} else if len(match[HEADING]) != 0 {
+					b.Write(match[HEADING])
+					b.WriteByte('\n')
+				}
 			}
-			link := string(match[1])
-			if len(link) > 0 {
-				doc.Links = append(doc.Links, link)
+
+			if opts.ParseLinks {
+				if len(match[LH_LINK]) != 0 {
+					doc.Links = append(doc.Links, string(match[LH_LINK]))
+				} else if len(match[LINK]) != 0 {
+					doc.Links = append(doc.Links, string(match[LINK]))
+				}
 			}
 		}
+
+		doc.Headings = b.String()
 	}
 
 	return doc, nil
@@ -482,5 +500,12 @@ func ParseDocs(paths []string, numWorkers uint, opts ParseOpts) (map[string]*Doc
 }
 
 func init() {
-	linkRegex = regexp.MustCompile(`\[.*\]\(\s*([^\)\s]+)\s*\)`)
+	headingPattern := `(?:^|\n)(?<heading>#{1,6}.*)`
+	linkPattern := `\[.*\]\(\s*(?<link>.*)\b\s*\)`
+	linkHeading := `(?:^|\n)(?<lh_heading>#{1,6}\s*\[.*\])\(\s*(?<lh_link>.*)\b\s*\)`
+	DocParseRegex = regexp.MustCompile(
+		linkHeading + "|" +
+			headingPattern + "|" +
+			linkPattern,
+	)
 }
diff --git a/pkg/index/index_test.go b/pkg/index/index_test.go
index 4242ce1..d3ca72c 100644
--- a/pkg/index/index_test.go
+++ b/pkg/index/index_test.go
@@ -264,6 +264,7 @@ func TestIndex_ParseOne(t *testing.T) {
 				In this sentence there is a valid [hyperlink](https://jpappel.xyz).
 				But in this sentence, the [link]() should not get parsed.
 				The same is true for the [link](       	) in this sentence.
+				There must be a nonwhitespace characters for a link to be a [valid link](   destination  )
 				`)
 
 				return path
@@ -271,7 +272,51 @@ func TestIndex_ParseOne(t *testing.T) {
 			index.ParseOpts{ParseLinks: true},
 			&index.Document{
 				Title: "Link test",
-				Links: []string{"https://jpappel.xyz"},
+				Links: []string{"https://jpappel.xyz", "destination"},
+			},
+			nil,
+		},
+		{
+			"headings",
+			func(t *testing.T) string {
+				f, path := newTestFile(t, "headings")
+				defer f.Close()
+
+				f.WriteString("---\n")
+				f.WriteString("title: Heading test\n")
+				f.WriteString("---\n")
+				f.WriteString("# A Heading\n")
+				f.WriteString("##Another Heading\n")
+				f.WriteString("### [Linked Heading](but no link parse)\n")
+				return path
+			},
+			index.ParseOpts{ParseHeadings: true},
+			&index.Document{
+				Title:    "Heading test",
+				Headings: "# A Heading\n##Another Heading\n### [Linked Heading]\n",
+			},
+			nil,
+		},
+		{
+			"linked_headings",
+			func(t *testing.T) string {
+				f, path := newTestFile(t, "linked_headings")
+				defer f.Close()
+
+				f.WriteString("---\n")
+				f.WriteString("title: Linked Heading Test\n")
+				f.WriteString("---\n")
+
+				f.WriteString("#[Top Level Heading](and its link)\n")
+				f.WriteString("## [Second Level heading](  sometext  )\n")
+
+				return path
+			},
+			index.ParseOpts{ParseLinks: true, ParseHeadings: true},
+			&index.Document{
+				Title:    "Linked Heading Test",
+				Headings: "#[Top Level Heading]\n## [Second Level heading]\n",
+				Links:    []string{"and its link", "sometext"},
 			},
 			nil,
 		},
author	JP Appel <jeanpierre.appel01@gmail.com>	2025-08-05 12:03:32 -0400
committer	JP Appel <jeanpierre.appel01@gmail.com>	2025-08-05 12:03:32 -0400
commit	62aeb1a0fb0a239f6193b7cb872787578480082c (patch)
tree	889b9e3c3f27cbac2be335e30076a51fb548334a /pkg/index
parent	f14c466d5d5d1f1a68153162349a74a154bcb535 (diff)