1
0
mirror of https://github.com/osmarks/mycorrhiza.git synced 2025-03-10 13:38:20 +00:00
2025-03-07 10:26:46 +00:00

318 lines
8.0 KiB
Go

// Package backlinks maintains the index of backlinks and lets you update it and query it.
package backlinks
import (
"log/slog"
"os"
"sort"
"time"
"strings"
"unicode"
"github.com/bouncepaw/mycorrhiza/util"
"github.com/bouncepaw/mycorrhiza/internal/hyphae"
"github.com/bouncepaw/mycorrhiza/history"
"github.com/rivo/uniseg"
"github.com/dchest/stemmer/porter2"
)
// yieldHyphaBacklinks gets backlinks for the desired hypha, sorts and yields them one by one.
func yieldHyphaBacklinks(hyphaName string) <-chan string {
hyphaName = util.CanonicalName(hyphaName)
out := make(chan string)
sorted := hyphae.PathographicSort(out)
go func() {
backlinks, exists := backlinkIndex[hyphaName]
if exists {
for link := range backlinks {
out <- link
}
}
close(out)
}()
return sorted
}
var backlinkConveyor = make(chan backlinkIndexOperation) // No need to buffer because these operations are rare.
// RunBacklinksConveyor runs an index operation processing loop. Call it somewhere in main.
func RunBacklinksConveyor() {
// It is supposed to run as a goroutine for all the time. So, don't blame the infinite loop.
defer close(backlinkConveyor)
for {
(<-backlinkConveyor).apply()
}
}
var ZeroTime time.Time
type Metadata struct {
Outlinks []string
Bytes int
Words int
Updated time.Time
}
var backlinkIndex = make(map[string]linkSet)
var forwardIndex = make(map[string]Metadata)
var invertedIndex = make(map[string]map[string]int)
func scrubInvertedIndexEntry(hyphaName string, tokens []string) {
for _, token := range tokens {
if tmap, exists := invertedIndex[token]; exists {
delete(tmap, hyphaName)
}
}
}
func writeTokensToInvertedIndex(hyphaName string, tokens []string) {
for _, token := range tokens {
if tmap, exists := invertedIndex[token]; exists {
tmap[hyphaName] += 1
} else {
tmap := make(map[string]int)
tmap[hyphaName] = 1
invertedIndex[token] = tmap
}
}
}
func containsAlnum(s string) bool {
for _, c := range s {
if unicode.IsLetter(c) || unicode.IsNumber(c) {
return true
}
}
return false
}
func tokenize(s string) []string {
eng := porter2.Stemmer
tokens := make([]string, 0)
state := -1
remainder := s
var c string
for len(remainder) > 0 {
c, remainder, state = uniseg.FirstWordInString(remainder, state)
if containsAlnum(c) {
token := eng.Stem(strings.ToLower(c))
tokens = append(tokens, token)
}
}
return tokens
}
func generateMetadata(h hyphae.Hypha, content string) {
tokens := tokenize(content)
meta := Metadata{
Outlinks: extractHyphaLinksFromContent(h.CanonicalName(), content),
Bytes: len(content),
Words: len(tokens),
}
forwardIndex[h.CanonicalName()] = meta
}
func updateRevTimestamp(h hyphae.Hypha, newTime time.Time) {
if _, exists := forwardIndex[h.CanonicalName()]; exists {
// ??? Golang ?????
meta := forwardIndex[h.CanonicalName()]
meta.Updated = newTime
forwardIndex[h.CanonicalName()] = meta
}
}
// IndexBacklinks traverses all text hyphae, extracts links from them and forms an initial index. Call it when indexing and reindexing hyphae.
func IndexBacklinks() {
// It is safe to ignore the mutex, because there is only one worker.
for h := range hyphae.YieldExistingHyphae() {
content := fetchText(h)
foundLinks := extractHyphaLinksFromContent(h.CanonicalName(), content)
for _, link := range foundLinks {
if _, exists := backlinkIndex[link]; !exists {
backlinkIndex[link] = make(linkSet)
}
backlinkIndex[link][h.CanonicalName()] = struct{}{}
}
generateMetadata(h, content)
if revs, err := history.Revisions(h.CanonicalName()); err == nil {
// sorted newest first
if len(revs) > 0 {
updateRevTimestamp(h, revs[0].Time)
}
}
writeTokensToInvertedIndex(h.CanonicalName(), tokenize(util.BeautifulName(h.CanonicalName())))
writeTokensToInvertedIndex(h.CanonicalName(), tokenize(content))
}
}
func Search(query string) []string {
tokens := tokenize(query)
result := make(map[string]int)
for _, token := range tokens {
if documents, exists := invertedIndex[token]; exists {
for name, termFrequency := range documents {
result[name] += termFrequency
}
}
}
// TODO: actually use the tf
sortedResult := make([]string, 0)
for name, _ := range result {
sortedResult = append(sortedResult, name)
}
sort.Strings(sortedResult)
return sortedResult
}
// BacklinksCount returns the amount of backlinks to the hypha. Pass canonical names.
func BacklinksCount(hyphaName string) int {
if links, exists := backlinkIndex[hyphaName]; exists {
return len(links)
}
return 0
}
func BacklinksFor(hyphaName string) []string {
var backlinks []string
for b := range yieldHyphaBacklinks(hyphaName) {
backlinks = append(backlinks, b)
}
return backlinks
}
func MetadataFor(hyphaName string) Metadata {
return forwardIndex[hyphaName]
}
func Orphans() []string {
var orphans []string
for h := range hyphae.YieldExistingHyphae() {
if BacklinksCount(h.CanonicalName()) == 0 {
orphans = append(orphans, h.CanonicalName())
}
}
sort.Strings(orphans)
return orphans
}
// Using set here seems like the most appropriate solution
type linkSet map[string]struct{}
func toLinkSet(xs []string) linkSet {
result := make(linkSet)
for _, x := range xs {
result[x] = struct{}{}
}
return result
}
func fetchText(h hyphae.Hypha) string {
var path string
switch h := h.(type) {
case *hyphae.EmptyHypha:
return ""
case *hyphae.TextualHypha:
path = h.TextFilePath()
case *hyphae.MediaHypha:
if !h.HasTextFile() {
return ""
}
path = h.TextFilePath()
}
text, err := os.ReadFile(path)
if err != nil {
slog.Error("Failed to read file", "path", path, "err", err, "hyphaName", h.CanonicalName())
return ""
}
return string(text)
}
// backlinkIndexOperation is an operation for the backlink index. This operation is executed async-safe.
type backlinkIndexOperation interface {
apply()
}
// backlinkIndexEdit contains data for backlink index update after a hypha edit
type backlinkIndexEdit struct {
name string
oldLinks []string
newLinks []string
content string
oldContent string
}
// apply changes backlink index respective to the operation data
func (op backlinkIndexEdit) apply() {
oldLinks := toLinkSet(op.oldLinks)
newLinks := toLinkSet(op.newLinks)
for link := range oldLinks {
if _, exists := newLinks[link]; !exists {
delete(backlinkIndex[link], op.name)
}
}
for link := range newLinks {
if _, exists := oldLinks[link]; !exists {
if _, exists := backlinkIndex[link]; !exists {
backlinkIndex[link] = make(linkSet)
}
backlinkIndex[link][op.name] = struct{}{}
}
}
hyp := hyphae.ByName(op.name)
generateMetadata(hyp, op.content)
// wrong, but close enough
updateRevTimestamp(hyp, time.Now())
scrubInvertedIndexEntry(op.name, tokenize(op.oldContent))
writeTokensToInvertedIndex(op.name, tokenize(op.content))
}
// backlinkIndexDeletion contains data for backlink index update after a hypha deletion
type backlinkIndexDeletion struct {
name string
links []string
content string
}
// apply changes backlink index respective to the operation data
func (op backlinkIndexDeletion) apply() {
for _, link := range op.links {
if lSet, exists := backlinkIndex[link]; exists {
delete(lSet, op.name)
}
}
delete(forwardIndex, op.name)
scrubInvertedIndexEntry(op.name, tokenize(op.content))
scrubInvertedIndexEntry(op.name, tokenize(util.BeautifulName(op.name)))
}
// backlinkIndexRenaming contains data for backlink index update after a hypha renaming
type backlinkIndexRenaming struct {
oldName string
newName string
links []string
content string
}
// apply changes backlink index respective to the operation data
func (op backlinkIndexRenaming) apply() {
for _, link := range op.links {
if lSet, exists := backlinkIndex[link]; exists {
delete(lSet, op.oldName)
backlinkIndex[link][op.newName] = struct{}{}
}
}
scrubInvertedIndexEntry(op.oldName, tokenize(op.content))
scrubInvertedIndexEntry(op.oldName, tokenize(util.BeautifulName(op.oldName)))
writeTokensToInvertedIndex(op.newName, tokenize(op.content))
writeTokensToInvertedIndex(op.newName, tokenize(util.BeautifulName(op.newName)))
}