mirror of
https://github.com/osmarks/mycorrhiza.git
synced 2025-03-10 13:38:20 +00:00
318 lines
8.0 KiB
Go
318 lines
8.0 KiB
Go
// Package backlinks maintains the index of backlinks and lets you update it and query it.
|
|
package backlinks
|
|
|
|
import (
|
|
"log/slog"
|
|
"os"
|
|
"sort"
|
|
"time"
|
|
"strings"
|
|
"unicode"
|
|
|
|
"github.com/bouncepaw/mycorrhiza/util"
|
|
"github.com/bouncepaw/mycorrhiza/internal/hyphae"
|
|
"github.com/bouncepaw/mycorrhiza/history"
|
|
"github.com/rivo/uniseg"
|
|
"github.com/dchest/stemmer/porter2"
|
|
)
|
|
|
|
// yieldHyphaBacklinks gets backlinks for the desired hypha, sorts and yields them one by one.
|
|
func yieldHyphaBacklinks(hyphaName string) <-chan string {
|
|
hyphaName = util.CanonicalName(hyphaName)
|
|
out := make(chan string)
|
|
sorted := hyphae.PathographicSort(out)
|
|
go func() {
|
|
backlinks, exists := backlinkIndex[hyphaName]
|
|
if exists {
|
|
for link := range backlinks {
|
|
out <- link
|
|
}
|
|
}
|
|
close(out)
|
|
}()
|
|
return sorted
|
|
}
|
|
|
|
var backlinkConveyor = make(chan backlinkIndexOperation) // No need to buffer because these operations are rare.
|
|
|
|
// RunBacklinksConveyor runs an index operation processing loop. Call it somewhere in main.
|
|
func RunBacklinksConveyor() {
|
|
// It is supposed to run as a goroutine for all the time. So, don't blame the infinite loop.
|
|
defer close(backlinkConveyor)
|
|
for {
|
|
(<-backlinkConveyor).apply()
|
|
}
|
|
}
|
|
|
|
var ZeroTime time.Time
|
|
|
|
type Metadata struct {
|
|
Outlinks []string
|
|
Bytes int
|
|
Words int
|
|
Updated time.Time
|
|
}
|
|
|
|
var backlinkIndex = make(map[string]linkSet)
|
|
var forwardIndex = make(map[string]Metadata)
|
|
var invertedIndex = make(map[string]map[string]int)
|
|
|
|
func scrubInvertedIndexEntry(hyphaName string, tokens []string) {
|
|
for _, token := range tokens {
|
|
if tmap, exists := invertedIndex[token]; exists {
|
|
delete(tmap, hyphaName)
|
|
}
|
|
}
|
|
}
|
|
|
|
func writeTokensToInvertedIndex(hyphaName string, tokens []string) {
|
|
for _, token := range tokens {
|
|
if tmap, exists := invertedIndex[token]; exists {
|
|
tmap[hyphaName] += 1
|
|
} else {
|
|
tmap := make(map[string]int)
|
|
tmap[hyphaName] = 1
|
|
invertedIndex[token] = tmap
|
|
}
|
|
}
|
|
}
|
|
|
|
func containsAlnum(s string) bool {
|
|
for _, c := range s {
|
|
if unicode.IsLetter(c) || unicode.IsNumber(c) {
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
func tokenize(s string) []string {
|
|
eng := porter2.Stemmer
|
|
|
|
tokens := make([]string, 0)
|
|
state := -1
|
|
remainder := s
|
|
var c string
|
|
for len(remainder) > 0 {
|
|
c, remainder, state = uniseg.FirstWordInString(remainder, state)
|
|
if containsAlnum(c) {
|
|
token := eng.Stem(strings.ToLower(c))
|
|
tokens = append(tokens, token)
|
|
}
|
|
}
|
|
|
|
return tokens
|
|
}
|
|
|
|
func generateMetadata(h hyphae.Hypha, content string) {
|
|
tokens := tokenize(content)
|
|
|
|
meta := Metadata{
|
|
Outlinks: extractHyphaLinksFromContent(h.CanonicalName(), content),
|
|
Bytes: len(content),
|
|
Words: len(tokens),
|
|
}
|
|
forwardIndex[h.CanonicalName()] = meta
|
|
}
|
|
|
|
func updateRevTimestamp(h hyphae.Hypha, newTime time.Time) {
|
|
if _, exists := forwardIndex[h.CanonicalName()]; exists {
|
|
// ??? Golang ?????
|
|
meta := forwardIndex[h.CanonicalName()]
|
|
meta.Updated = newTime
|
|
forwardIndex[h.CanonicalName()] = meta
|
|
}
|
|
}
|
|
|
|
// IndexBacklinks traverses all text hyphae, extracts links from them and forms an initial index. Call it when indexing and reindexing hyphae.
|
|
func IndexBacklinks() {
|
|
// It is safe to ignore the mutex, because there is only one worker.
|
|
for h := range hyphae.YieldExistingHyphae() {
|
|
content := fetchText(h)
|
|
foundLinks := extractHyphaLinksFromContent(h.CanonicalName(), content)
|
|
for _, link := range foundLinks {
|
|
if _, exists := backlinkIndex[link]; !exists {
|
|
backlinkIndex[link] = make(linkSet)
|
|
}
|
|
backlinkIndex[link][h.CanonicalName()] = struct{}{}
|
|
}
|
|
generateMetadata(h, content)
|
|
if revs, err := history.Revisions(h.CanonicalName()); err == nil {
|
|
// sorted newest first
|
|
if len(revs) > 0 {
|
|
updateRevTimestamp(h, revs[0].Time)
|
|
}
|
|
}
|
|
|
|
writeTokensToInvertedIndex(h.CanonicalName(), tokenize(util.BeautifulName(h.CanonicalName())))
|
|
writeTokensToInvertedIndex(h.CanonicalName(), tokenize(content))
|
|
}
|
|
}
|
|
|
|
func Search(query string) []string {
|
|
tokens := tokenize(query)
|
|
result := make(map[string]int)
|
|
for _, token := range tokens {
|
|
if documents, exists := invertedIndex[token]; exists {
|
|
for name, termFrequency := range documents {
|
|
result[name] += termFrequency
|
|
}
|
|
}
|
|
}
|
|
// TODO: actually use the tf
|
|
sortedResult := make([]string, 0)
|
|
for name, _ := range result {
|
|
sortedResult = append(sortedResult, name)
|
|
}
|
|
sort.Strings(sortedResult)
|
|
return sortedResult
|
|
}
|
|
|
|
// BacklinksCount returns the amount of backlinks to the hypha. Pass canonical names.
|
|
func BacklinksCount(hyphaName string) int {
|
|
if links, exists := backlinkIndex[hyphaName]; exists {
|
|
return len(links)
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func BacklinksFor(hyphaName string) []string {
|
|
var backlinks []string
|
|
for b := range yieldHyphaBacklinks(hyphaName) {
|
|
backlinks = append(backlinks, b)
|
|
}
|
|
return backlinks
|
|
}
|
|
|
|
func MetadataFor(hyphaName string) Metadata {
|
|
return forwardIndex[hyphaName]
|
|
}
|
|
|
|
func Orphans() []string {
|
|
var orphans []string
|
|
for h := range hyphae.YieldExistingHyphae() {
|
|
if BacklinksCount(h.CanonicalName()) == 0 {
|
|
orphans = append(orphans, h.CanonicalName())
|
|
}
|
|
}
|
|
sort.Strings(orphans)
|
|
return orphans
|
|
}
|
|
|
|
// Using set here seems like the most appropriate solution
|
|
type linkSet map[string]struct{}
|
|
|
|
func toLinkSet(xs []string) linkSet {
|
|
result := make(linkSet)
|
|
for _, x := range xs {
|
|
result[x] = struct{}{}
|
|
}
|
|
return result
|
|
}
|
|
|
|
func fetchText(h hyphae.Hypha) string {
|
|
var path string
|
|
switch h := h.(type) {
|
|
case *hyphae.EmptyHypha:
|
|
return ""
|
|
case *hyphae.TextualHypha:
|
|
path = h.TextFilePath()
|
|
case *hyphae.MediaHypha:
|
|
if !h.HasTextFile() {
|
|
return ""
|
|
}
|
|
path = h.TextFilePath()
|
|
}
|
|
|
|
text, err := os.ReadFile(path)
|
|
if err != nil {
|
|
slog.Error("Failed to read file", "path", path, "err", err, "hyphaName", h.CanonicalName())
|
|
return ""
|
|
}
|
|
return string(text)
|
|
}
|
|
|
|
// backlinkIndexOperation is an operation for the backlink index. This operation is executed async-safe.
|
|
type backlinkIndexOperation interface {
|
|
apply()
|
|
}
|
|
|
|
// backlinkIndexEdit contains data for backlink index update after a hypha edit
|
|
type backlinkIndexEdit struct {
|
|
name string
|
|
oldLinks []string
|
|
newLinks []string
|
|
content string
|
|
oldContent string
|
|
}
|
|
|
|
// apply changes backlink index respective to the operation data
|
|
func (op backlinkIndexEdit) apply() {
|
|
oldLinks := toLinkSet(op.oldLinks)
|
|
newLinks := toLinkSet(op.newLinks)
|
|
for link := range oldLinks {
|
|
if _, exists := newLinks[link]; !exists {
|
|
delete(backlinkIndex[link], op.name)
|
|
}
|
|
}
|
|
for link := range newLinks {
|
|
if _, exists := oldLinks[link]; !exists {
|
|
if _, exists := backlinkIndex[link]; !exists {
|
|
backlinkIndex[link] = make(linkSet)
|
|
}
|
|
backlinkIndex[link][op.name] = struct{}{}
|
|
}
|
|
}
|
|
hyp := hyphae.ByName(op.name)
|
|
generateMetadata(hyp, op.content)
|
|
// wrong, but close enough
|
|
updateRevTimestamp(hyp, time.Now())
|
|
|
|
scrubInvertedIndexEntry(op.name, tokenize(op.oldContent))
|
|
writeTokensToInvertedIndex(op.name, tokenize(op.content))
|
|
}
|
|
|
|
// backlinkIndexDeletion contains data for backlink index update after a hypha deletion
|
|
type backlinkIndexDeletion struct {
|
|
name string
|
|
links []string
|
|
content string
|
|
}
|
|
|
|
// apply changes backlink index respective to the operation data
|
|
func (op backlinkIndexDeletion) apply() {
|
|
for _, link := range op.links {
|
|
if lSet, exists := backlinkIndex[link]; exists {
|
|
delete(lSet, op.name)
|
|
}
|
|
}
|
|
delete(forwardIndex, op.name)
|
|
|
|
scrubInvertedIndexEntry(op.name, tokenize(op.content))
|
|
scrubInvertedIndexEntry(op.name, tokenize(util.BeautifulName(op.name)))
|
|
}
|
|
|
|
// backlinkIndexRenaming contains data for backlink index update after a hypha renaming
|
|
type backlinkIndexRenaming struct {
|
|
oldName string
|
|
newName string
|
|
links []string
|
|
content string
|
|
}
|
|
|
|
// apply changes backlink index respective to the operation data
|
|
func (op backlinkIndexRenaming) apply() {
|
|
for _, link := range op.links {
|
|
if lSet, exists := backlinkIndex[link]; exists {
|
|
delete(lSet, op.oldName)
|
|
backlinkIndex[link][op.newName] = struct{}{}
|
|
}
|
|
}
|
|
|
|
scrubInvertedIndexEntry(op.oldName, tokenize(op.content))
|
|
scrubInvertedIndexEntry(op.oldName, tokenize(util.BeautifulName(op.oldName)))
|
|
writeTokensToInvertedIndex(op.newName, tokenize(op.content))
|
|
writeTokensToInvertedIndex(op.newName, tokenize(util.BeautifulName(op.newName)))
|
|
}
|