about summary refs log tree commit diff stats
path: root/pkg
diff options
context:
space:
mode:
author: JP Appel <jeanpierre.appel01@gmail.com> 2025-07-02 00:06:10 -0400
committer: JP Appel <jeanpierre.appel01@gmail.com> 2025-07-02 00:06:10 -0400
commit: ba68130862dc004a7a1b50d99fc70872d39fd065 (patch)
tree: ef4f08b1d91ff86f717fff4b30474048f77ae4c1 /pkg
parent: 4582265de0c0472755880652dc7b390b342cf3e0 (diff)
Add link parsing
Diffstat (limited to 'pkg')
-rw-r--r-- pkg/data/db.go 20
-rw-r--r-- pkg/data/get.go 18
-rw-r--r-- pkg/data/get_test.go 22
-rw-r--r-- pkg/data/put.go 30
-rw-r--r-- pkg/data/put_test.go 77
-rw-r--r-- pkg/index/index.go 31
-rw-r--r-- pkg/index/index_test.go 25
7 files changed, 170 insertions, 53 deletions
diff --git a/pkg/data/db.go b/pkg/data/db.go
index 24a8793..9c8c5b1 100644
--- a/pkg/data/db.go
+++ b/pkg/data/db.go
@@ -19,7 +19,7 @@ type Query struct {
//
// output is in the form
//
-// <query> <start><(n-1)*(<val><delim)>><val><delim><stop>
+// <query> <start><(n-1)*(<val><delim>)><val><stop>
func BatchQuery[T any](query string, start string, val string, delim string, stop string, n int, baseArgs []T) (string, []any) {
args := make([]any, len(baseArgs))
for i, arg := range baseArgs {
@@ -139,10 +139,10 @@ func createSchema(db *sql.DB) error {
_, err = tx.Exec(`
CREATE TABLE IF NOT EXISTS Links(
- referencedId INT,
- refererId INT,
- FOREIGN KEY (referencedId) REFERENCES Documents(id),
- FOREIGN KEY (refererId) REFERENCES Documents(id)
+ docId INT,
+ link TEXT NOT NULL,
+ FOREIGN KEY (docId) REFERENCES Documents(id),
+ UNIQUE(docId, link)
)`)
if err != nil {
tx.Rollback()
@@ -198,6 +198,12 @@ func createSchema(db *sql.DB) error {
return err
}
+ _, err = tx.Exec("CREATE INDEX IF NOT EXISTS idx_links_link ON Links(link)")
+ if err != nil {
+ tx.Rollback()
+ return err
+ }
+
_, err = tx.Exec("CREATE INDEX IF NOT EXISTS idx_doctags_tagid ON DocumentTags (tagId)")
if err != nil {
tx.Rollback()
@@ -214,13 +220,15 @@ func createSchema(db *sql.DB) error {
d.fileTime,
d.meta,
COALESCE(a.name, al.alias) AS author,
- t.name AS tag
+ t.name AS tag,
+ l.link
FROM Documents d
LEFT JOIN DocumentAuthors da ON d.id = da.docId
LEFT JOIN Authors a ON da.authorId = a.id
LEFT JOIN Aliases al ON a.id = al.authorId
LEFT JOIN DocumentTags dt ON d.id = dt.docId
LEFT JOIN Tags t ON dt.tagId = t.id
+ LEFT JOIN Links l ON d.id = l.docId
`)
if err != nil {
tx.Rollback()
diff --git a/pkg/data/get.go b/pkg/data/get.go
index 09d4587..8dafb24 100644
--- a/pkg/data/get.go
+++ b/pkg/data/get.go
@@ -108,6 +108,8 @@ func (f *FillMany) documents(ctx context.Context, rows *sql.Rows) error {
return err
}
defer rows.Close()
+ } else {
+ // TODO: verify that rows.ColumnTypes() matches the expected columns
}
var id int
@@ -273,11 +275,9 @@ func (f FillMany) tags(ctx context.Context) error {
func (f Fill) links(ctx context.Context) error {
rows, err := f.Db.QueryContext(ctx, `
- SELECT path
- FROM Documents
- JOIN Links
- ON Links.referencedId = Documents.id
- WHERE Links.refererId = ?
+ SELECT link
+ FROM Links
+ WHERE Links.docId = ?
`, f.id)
if err != nil {
return err
@@ -299,11 +299,9 @@ func (f Fill) links(ctx context.Context) error {
func (f FillMany) links(ctx context.Context) error {
stmt, err := f.Db.PrepareContext(ctx, `
- SELECT path
- FROM Documents
- JOIN Links
- ON Links.referencedId = Documents.id
- WHERE Links.refererId = ?
+ SELECT link
+ FROM Links
+ WHERE Links.docId = ?
`)
if err != nil {
return err
diff --git a/pkg/data/get_test.go b/pkg/data/get_test.go
index 14d6920..22e5af2 100644
--- a/pkg/data/get_test.go
+++ b/pkg/data/get_test.go
@@ -1,7 +1,6 @@
package data_test
import (
- "context"
"database/sql"
"errors"
"testing"
@@ -57,6 +56,13 @@ func singleDoc(t *testing.T) *sql.DB {
t.Fatal("err inserting docTags:", err)
}
+ if _, err := db.Exec(`
+ INSERT INTO Links (docId, link)
+ VALUES (1, 'link1'), (1, 'link2')
+ `); err != nil {
+ t.Fatal("err inserting links:", err)
+ }
+
return db
}
@@ -106,6 +112,13 @@ func multiDoc(t *testing.T) *sql.DB {
t.Fatal("err inserting docTags:", err)
}
+ if _, err := db.Exec(`
+ INSERT INTO Links (docId, link)
+ VALUES (1, '/home'), (2, 'rsync://rsync.kernel.org/pub/')
+ `); err != nil {
+ t.Fatal("err inserting links:", err)
+ }
+
return db
}
@@ -129,6 +142,7 @@ func TestFill_Get(t *testing.T) {
FileTime: time.Unix(2, 0),
Authors: []string{"jp"},
Tags: []string{"foo", "bar", "oof", "baz"},
+ Links: []string{"link1", "link2"},
},
nil,
},
@@ -136,7 +150,7 @@ func TestFill_Get(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
f := tt.newFill(t)
- got, gotErr := f.Get(context.Background())
+ got, gotErr := f.Get(t.Context())
if !errors.Is(gotErr, tt.wantErr) {
t.Fatalf("Recieved unexpected error: got %v want %v", gotErr, tt.wantErr)
@@ -172,6 +186,7 @@ func TestFillMany_Get(t *testing.T) {
FileTime: time.Unix(2, 0),
Authors: []string{"jp"},
Tags: []string{"foo", "baz"},
+ Links: []string{"/home"},
},
"README.md": {
Path: "README.md",
@@ -180,6 +195,7 @@ func TestFillMany_Get(t *testing.T) {
FileTime: time.Unix(4, 0),
Authors: []string{"anonymous", "jp"},
Tags: []string{"bar", "oof"},
+ Links: []string{"rsync://rsync.kernel.org/pub/"},
},
},
nil,
@@ -187,7 +203,7 @@ func TestFillMany_Get(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- ctx := context.Background()
+ ctx := t.Context()
f := tt.newFillMany(t)
got, gotErr := f.Get(ctx)
diff --git a/pkg/data/put.go b/pkg/data/put.go
index e0185ae..0d49e60 100644
--- a/pkg/data/put.go
+++ b/pkg/data/put.go
@@ -226,14 +226,13 @@ func (p Put) links() error {
return nil
}
- preQuery := fmt.Sprintf(`
- INSERT INTO Links (referencedId, refererId)
- SELECT id, %d
- FROM Documents
- WHERE path IN
- `, p.Id)
- query, args := BatchQuery(preQuery, "(", "?", ",", ")", len(p.Doc.Links), p.Doc.Links)
- if _, err := p.tx.Exec(query, args...); err != nil {
+ preQuery := `
+ INSERT INTO Links (docId, link)
+ VALUES
+ `
+ valueStr := fmt.Sprintf("(%d,?)", p.Id)
+ query, args := BatchQuery(preQuery, "", valueStr, ",", "", len(p.Doc.Links), p.Doc.Links)
+ if _, err := p.tx.Exec(query + "\n ON CONFLICT DO NOTHING", args...); err != nil {
return err
}
@@ -251,14 +250,13 @@ func (p PutMany) links(ctx context.Context) error {
continue
}
- preQuery := fmt.Sprintf(`
- INSERT INTO Links (referencedId, refererId)
- SELECT id, %d
- FROM Documents
- WHERE path IN
- `, id)
- query, args := BatchQuery(preQuery, "(", "?", ",", ")", len(doc.Links), doc.Links)
- if _, err := tx.Exec(query, args...); err != nil {
+ preQuery := `
+ INSERT INTO Links (docId, link)
+ VALUES
+ `
+ valueStr := fmt.Sprintf("(%d,?)", id)
+ query, args := BatchQuery(preQuery, "", valueStr, ",", "", len(doc.Links), doc.Links)
+ if _, err := tx.Exec(query +"\n ON CONFLICT DO NOTHING", args...); err != nil {
tx.Rollback()
return err
}
diff --git a/pkg/data/put_test.go b/pkg/data/put_test.go
index 7e5ad38..a97187b 100644
--- a/pkg/data/put_test.go
+++ b/pkg/data/put_test.go
@@ -1,7 +1,6 @@
package data_test
import (
- "context"
"database/sql"
"errors"
"testing"
@@ -31,13 +30,14 @@ func TestPut_Insert(t *testing.T) {
FileTime: time.Unix(2, 0),
Authors: []string{"jp"},
Tags: []string{"foo", "bar", "oof", "baz"},
+ Links: []string{"link_1", "link_2", "link_3"},
},
nil,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- ctx := context.Background()
+ ctx := t.Context()
db := tt.newDb(t)
defer db.Close()
@@ -68,31 +68,76 @@ func TestPut_Insert(t *testing.T) {
func TestPutMany_Insert(t *testing.T) {
tests := []struct {
- name string // description of this test case
- // Named input parameters for receiver constructor.
- db *sql.DB
+ name string
+ newDb func(t *testing.T) *sql.DB
documents map[string]*index.Document
- wantErr bool
+ wantErr error
}{
- // TODO: Add test cases.
+ {
+ name: "insert on empty",
+ newDb: func(t *testing.T) *sql.DB {
+ t.Helper()
+ return data.NewMemDB()
+ },
+ documents: map[string]*index.Document{
+ "/file": {
+ Path: "/file",
+ Title: "A file",
+ Date: time.Unix(1, 0),
+ FileTime: time.Unix(2, 0),
+ Authors: []string{"jp"},
+ Tags: []string{"foo", "bar", "oof", "baz"},
+ Links: []string{"link_1", "link_2", "link_3"},
+ },
+ "/file2": {
+ Path: "/file2",
+ Title: "A different file",
+ Date: time.Unix(3, 0),
+ FileTime: time.Unix(4, 0),
+ Authors: []string{"pj"},
+ Tags: []string{"apple", "pear", "peach"},
+ Links: []string{"a very useful link"},
+ },
+ },
+ wantErr: nil,
+ },
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- p, err := data.NewPutMany(tt.db, tt.documents)
+ db := tt.newDb(t)
+ p, err := data.NewPutMany(db, tt.documents)
if err != nil {
t.Fatalf("could not construct receiver type: %v", err)
}
- gotErr := p.Insert(context.Background())
- if gotErr != nil {
- if !tt.wantErr {
- t.Errorf("Insert() failed: %v", gotErr)
- }
+
+ gotErr := p.Insert(t.Context())
+ if !errors.Is(gotErr, tt.wantErr) {
+ t.Fatalf("Recieved unexpected error, got %v want %v", gotErr, tt.wantErr)
+ } else if err != nil {
return
}
- if tt.wantErr {
- t.Fatal("Insert() succeeded unexpectedly")
+
+ f := data.FillMany{Db: db}
+ gotDocs, err := f.Get(t.Context())
+ if err != nil {
+ t.Fatal("Error while retrieving documents for comparison:", err)
+ }
+
+ wantLen, gotLen := len(tt.documents), len(gotDocs)
+ if wantLen != gotLen {
+ t.Fatalf("Recieved differnt number of documents than expected: got %d, want %d", gotLen, wantLen)
+ }
+
+ for path, wantDoc := range tt.documents {
+ gotDoc, ok := gotDocs[path]
+ if !ok {
+ t.Errorf("Wanted doc with path %s but did not recieve it", path)
+ }
+
+ if !wantDoc.Equal(*gotDoc) {
+ t.Errorf("Difference betwen docs!\ngot: %+v\nwant: %+v", gotDoc, wantDoc)
+ }
}
})
}
}
-
diff --git a/pkg/index/index.go b/pkg/index/index.go
index d49636f..13c4f45 100644
--- a/pkg/index/index.go
+++ b/pkg/index/index.go
@@ -1,11 +1,13 @@
package index
import (
+ "bytes"
"errors"
"fmt"
"io"
"log/slog"
"os"
+ "regexp"
"slices"
"strings"
"sync"
@@ -17,6 +19,7 @@ import (
)
var ErrHeaderParse error = errors.New("Unable to parse YAML header")
+var linkRegex *regexp.Regexp
type Document struct {
Path string `yaml:"-" json:"path"`
@@ -32,6 +35,7 @@ type Document struct {
type ParseOpts struct {
ParseMeta bool
+ ParseLinks bool
IgnoreDateError bool
IgnoreMetaError bool
}
@@ -345,12 +349,31 @@ func ParseDoc(path string, opts ParseOpts) (*Document, error) {
if pos < 0 {
return nil, fmt.Errorf("Can't find YAML header in %s", path)
}
+ header := io.NewSectionReader(f, 0, pos)
- if err := yaml.NewDecoder(io.LimitReader(f, pos)).Decode(doc); err != nil {
+ if err := yaml.NewDecoder(header).Decode(doc); err != nil {
return nil, errors.Join(ErrHeaderParse, err)
}
- // TODO: read the rest of the file to find links
+ if opts.ParseLinks {
+ var buf bytes.Buffer
+ f.Seek(pos, io.SeekStart)
+ if _, err := io.Copy(&buf, f); err != nil {
+ return nil, err
+ }
+
+ matches := linkRegex.FindAllSubmatch(buf.Bytes(), -1)
+ for _, match := range matches {
+ if len(match) != 2 {
+ panic("Link parsing regex returned unexpected number of matches")
+ }
+ link := string(match[1])
+ if len(link) > 0 && len(strings.TrimSpace(link)) > 0 {
+ doc.Links = append(doc.Links, link)
+ }
+ }
+ }
+
return doc, nil
}
@@ -396,3 +419,7 @@ func ParseDocs(paths []string, numWorkers uint, opts ParseOpts) map[string]*Docu
return docs
}
+
+func init() {
+ linkRegex = regexp.MustCompile(`\[.*\]\((.*)\)`)
+}
diff --git a/pkg/index/index_test.go b/pkg/index/index_test.go
index 0a3239d..69600c2 100644
--- a/pkg/index/index_test.go
+++ b/pkg/index/index_test.go
@@ -251,6 +251,31 @@ func TestIndex_ParseOne(t *testing.T) {
nil,
},
{
+ "links",
+ func(t *testing.T) string {
+ f, path := newTestFile(t, "links")
+ defer f.Close()
+
+ f.WriteString("---\n")
+ f.WriteString("title: Link test\n")
+ f.WriteString("---\n")
+ f.WriteString(`
+ Here are some words in a *markdown* file.
+ In this sentence there is a valid [hyperlink](https://jpappel.xyz).
+ But in this sentence, the [link]() should not get parsed.
+ The same is true for the [link]( ) in this sentence.
+ `)
+
+ return path
+ },
+ index.ParseOpts{ParseLinks: true},
+ &index.Document{
+ Title: "Link test",
+ Links: []string{"https://jpappel.xyz"},
+ },
+ nil,
+ },
+ {
"bad tags",
func(t *testing.T) string {
f, path := newTestFile(t, "badtags")