477 lines
13 KiB
Go
477 lines
13 KiB
Go
package bleve
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"iter"
|
|
"slices"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"fiatjaf.com/nostr"
|
|
"fiatjaf.com/nostr/eventstore"
|
|
"fiatjaf.com/nostr/nip27"
|
|
"fiatjaf.com/nostr/nip73"
|
|
"fiatjaf.com/nostr/sdk"
|
|
bleve "github.com/blevesearch/bleve/v2"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/analyzer/simple"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/ar"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/cjk"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/da"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/de"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/en"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/es"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/fa"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/fi"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/fr"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/gl"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/hi"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/hr"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/hu"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/in"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/it"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/nl"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/no"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/pl"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/pt"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/ro"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/ru"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/sv"
|
|
_ "github.com/blevesearch/bleve/v2/analysis/lang/tr"
|
|
bleveMapping "github.com/blevesearch/bleve/v2/mapping"
|
|
bleveQuery "github.com/blevesearch/bleve/v2/search/query"
|
|
"github.com/pemistahl/lingua-go"
|
|
)
|
|
|
|
// Names of the fields in each indexed bleve document. They are kept to a
// single letter; the content field is only ever used as a prefix, with "_"
// and an analyzer code appended (for example "c_en" or "c_cjk").
const (
	// labelContentField is the prefix of the per-language content fields.
	labelContentField = "c"
	// labelKindField holds the event kind as a decimal string.
	labelKindField = "k"
	// labelCreatedAtField holds the event creation time.
	labelCreatedAtField = "a"
	// labelAuthorField holds the tail of the author's hex-encoded public key.
	labelAuthorField = "p"
	// labelReferencesField holds references collected from tags and content.
	labelReferencesField = "r"
	// labelExtrasField holds external identifiers found in the content.
	labelExtrasField = "x"
)
|
|
|
|
var SupportedLanguages = []lingua.Language{
|
|
// each of these translates to a specific bleve analyzer
|
|
// except for japanese-korean-chinese that all use the same "cjk" analyzer
|
|
lingua.Arabic,
|
|
lingua.Chinese,
|
|
lingua.Croatian,
|
|
lingua.Danish,
|
|
lingua.Dutch,
|
|
lingua.English,
|
|
lingua.Finnish,
|
|
lingua.French,
|
|
lingua.German,
|
|
lingua.Hindi,
|
|
lingua.Hungarian,
|
|
lingua.Italian,
|
|
lingua.Japanese,
|
|
lingua.Korean,
|
|
lingua.Persian,
|
|
lingua.Polish,
|
|
lingua.Portuguese,
|
|
lingua.Romanian,
|
|
lingua.Russian,
|
|
lingua.Spanish,
|
|
lingua.Swedish,
|
|
lingua.Turkish,
|
|
}
|
|
|
|
type BleveBackend struct {
|
|
sync.Mutex
|
|
Path string
|
|
RawEventStore eventstore.Store
|
|
ReadOnly bool
|
|
OpenTimeout time.Duration
|
|
|
|
IndexableKinds []nostr.Kind
|
|
|
|
Languages []lingua.Language
|
|
languageCodes []string
|
|
|
|
index bleve.Index
|
|
detector lingua.LanguageDetector
|
|
}
|
|
|
|
func (b *BleveBackend) Init() error {
|
|
if b.Path == "" {
|
|
return fmt.Errorf("missing Path")
|
|
}
|
|
if b.RawEventStore == nil {
|
|
return fmt.Errorf("missing RawEventStore")
|
|
}
|
|
if len(b.Languages) == 0 {
|
|
return fmt.Errorf("missing Languages")
|
|
}
|
|
if len(b.IndexableKinds) == 0 {
|
|
b.IndexableKinds = []nostr.Kind{0, 1, 6, 11, 16, 20, 21, 22, 24, 1111, 9802, 30023, 30818}
|
|
}
|
|
|
|
validLanguages := make([]lingua.Language, 0, len(b.Languages))
|
|
b.languageCodes = make([]string, 0, len(b.Languages))
|
|
for _, lang := range b.Languages {
|
|
var code string
|
|
|
|
switch lang {
|
|
case lingua.Chinese, lingua.Korean, lingua.Japanese:
|
|
code = "cjk"
|
|
default:
|
|
code = strings.ToLower(lang.IsoCode639_1().String())
|
|
}
|
|
|
|
if slices.Contains(b.languageCodes, code) {
|
|
continue
|
|
}
|
|
|
|
validLanguages = append(validLanguages, lang)
|
|
b.languageCodes = append(b.languageCodes, code)
|
|
}
|
|
b.Languages = validLanguages
|
|
|
|
opts := map[string]any{
|
|
"read_only": b.ReadOnly,
|
|
}
|
|
if b.OpenTimeout != 0 {
|
|
opts["bolt_timeout"] = b.OpenTimeout.String()
|
|
}
|
|
|
|
index, err := bleve.OpenUsing(b.Path, opts)
|
|
if err == bleve.ErrorIndexPathDoesNotExist {
|
|
mapping := bleveMapping.NewIndexMapping()
|
|
mapping.DefaultMapping.Dynamic = false
|
|
doc := bleveMapping.NewDocumentStaticMapping()
|
|
|
|
for _, code := range b.languageCodes {
|
|
contentField := bleveMapping.NewTextFieldMapping()
|
|
contentField.Analyzer = code
|
|
contentField.Store = false
|
|
contentField.IncludeTermVectors = false
|
|
contentField.DocValues = false
|
|
contentField.IncludeInAll = false
|
|
doc.AddFieldMappingsAt(labelContentField+"_"+code, contentField)
|
|
}
|
|
|
|
extrasField := bleveMapping.NewTextFieldMapping()
|
|
extrasField.Analyzer = "simple"
|
|
extrasField.Store = false
|
|
extrasField.IncludeTermVectors = false
|
|
extrasField.DocValues = false
|
|
extrasField.IncludeInAll = false
|
|
doc.AddFieldMappingsAt(labelExtrasField, extrasField)
|
|
|
|
referencesField := bleveMapping.NewKeywordFieldMapping()
|
|
referencesField.DocValues = false
|
|
referencesField.Store = false
|
|
referencesField.IncludeTermVectors = false
|
|
referencesField.IncludeInAll = false
|
|
doc.AddFieldMappingsAt(labelReferencesField, referencesField)
|
|
|
|
authorField := bleveMapping.NewKeywordFieldMapping()
|
|
authorField.DocValues = false
|
|
authorField.Store = false
|
|
authorField.IncludeTermVectors = false
|
|
doc.AddFieldMappingsAt(labelAuthorField, authorField)
|
|
|
|
kindField := bleveMapping.NewKeywordFieldMapping()
|
|
kindField.DocValues = false
|
|
kindField.Store = false
|
|
kindField.IncludeTermVectors = false
|
|
kindField.IncludeInAll = false
|
|
doc.AddFieldMappingsAt(labelKindField, kindField)
|
|
|
|
timestampField := bleveMapping.NewDateTimeFieldMapping()
|
|
timestampField.DocValues = false
|
|
timestampField.Store = false
|
|
timestampField.IncludeTermVectors = false
|
|
timestampField.IncludeInAll = false
|
|
doc.AddFieldMappingsAt(labelCreatedAtField, timestampField)
|
|
|
|
mapping.AddDocumentMapping("_default", doc)
|
|
|
|
index, err = bleve.New(b.Path, mapping)
|
|
if err != nil {
|
|
return fmt.Errorf("error creating index: %w", err)
|
|
}
|
|
} else if err != nil {
|
|
return fmt.Errorf("error opening index: %w", err)
|
|
}
|
|
|
|
b.index = index
|
|
b.detector = lingua.NewLanguageDetectorBuilder().
|
|
FromLanguages(b.Languages...).
|
|
Build()
|
|
|
|
return nil
|
|
}
|
|
|
|
func (b *BleveBackend) Close() {
|
|
if b != nil && b.index != nil {
|
|
b.index.Close()
|
|
}
|
|
}
|
|
|
|
func (b *BleveBackend) SaveEvent(event nostr.Event) error {
|
|
if slices.Contains(b.IndexableKinds, event.Kind) {
|
|
return b.indexEvent(event)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (b *BleveBackend) DeleteEvent(id nostr.ID) error {
|
|
if b != nil && b.index != nil {
|
|
return b.index.Delete(id.Hex())
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (b *BleveBackend) indexEvent(evt nostr.Event) error {
|
|
docID := evt.ID
|
|
|
|
var references []string
|
|
var extras string
|
|
|
|
switch evt.Kind {
|
|
case 6, 16:
|
|
var innerEvt nostr.Event
|
|
if err := json.Unmarshal([]byte(evt.Content), &innerEvt); err != nil || !innerEvt.VerifySignature() {
|
|
return nil
|
|
}
|
|
evt = innerEvt
|
|
case 0:
|
|
var pm sdk.ProfileMetadata
|
|
if err := json.Unmarshal([]byte(evt.Content), &pm); err == nil {
|
|
evt.Content = pm.Name + "\n" + pm.DisplayName + "\n" + pm.About
|
|
references = append(references, pm.NIP05)
|
|
}
|
|
}
|
|
|
|
for _, tag := range evt.Tags {
|
|
if len(tag) < 2 {
|
|
continue
|
|
}
|
|
switch tag[0] {
|
|
case "comment", "name", "title", "about", "description":
|
|
evt.Content += "\n\n" + tag[1]
|
|
case "e":
|
|
if ptr, err := nostr.EventPointerFromTag(tag); err == nil {
|
|
references = append(references, ptr.AsTagReference())
|
|
}
|
|
case "a":
|
|
if ptr, err := nostr.EntityPointerFromTag(tag); err == nil {
|
|
references = append(references, ptr.AsTagReference())
|
|
}
|
|
case "r":
|
|
references = append(references, tag[1])
|
|
}
|
|
}
|
|
|
|
doc := map[string]any{
|
|
labelKindField: strconv.Itoa(int(evt.Kind)),
|
|
labelAuthorField: evt.PubKey.Hex()[56:],
|
|
labelCreatedAtField: evt.CreatedAt.Time(),
|
|
}
|
|
|
|
content := strings.Builder{}
|
|
content.Grow(len(evt.Content))
|
|
|
|
for block := range nip27.Parse(evt.Content) {
|
|
if block.Pointer == nil {
|
|
content.WriteString(strings.TrimSpace(block.Text))
|
|
} else {
|
|
references = append(references, block.Pointer.AsTagReference())
|
|
if ep, ok := block.Pointer.(nip73.ExternalPointer); ok {
|
|
extras += ep.Thing + " "
|
|
}
|
|
}
|
|
}
|
|
|
|
indexableContent := content.String()
|
|
lang, ok := b.detector.DetectLanguageOf(indexableContent)
|
|
if !ok {
|
|
lang = lingua.English
|
|
}
|
|
|
|
var analyzerLangCode string
|
|
switch lang {
|
|
case lingua.Japanese, lingua.Chinese, lingua.Korean:
|
|
analyzerLangCode = "cjk"
|
|
default:
|
|
analyzerLangCode = strings.ToLower(lang.IsoCode639_1().String())
|
|
}
|
|
doc[labelContentField+"_"+analyzerLangCode] = indexableContent
|
|
|
|
doc[labelReferencesField] = references
|
|
doc[labelExtrasField] = extras
|
|
|
|
if err := b.index.Index(docID.Hex(), doc); err != nil {
|
|
return fmt.Errorf("failed to index '%s' document: %w", docID.Hex(), err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (b *BleveBackend) CountEvents(filter nostr.Filter) (uint32, error) {
|
|
if filter.String() == "{}" {
|
|
count, err := b.index.DocCount()
|
|
return uint32(count), err
|
|
}
|
|
|
|
return 0, errors.New("not supported")
|
|
}
|
|
|
|
func (b *BleveBackend) QueryEvents(filter nostr.Filter, maxLimit int) iter.Seq[nostr.Event] {
|
|
return func(yield func(nostr.Event) bool) {
|
|
if tlimit := filter.GetTheoreticalLimit(); tlimit == 0 {
|
|
return
|
|
} else if tlimit < maxLimit {
|
|
maxLimit = tlimit
|
|
}
|
|
|
|
filter.Search = strings.TrimSpace(filter.Search)
|
|
if len(filter.Search) < 2 {
|
|
return
|
|
}
|
|
|
|
and := make([]bleveQuery.Query, 0, 3)
|
|
|
|
searchC := strings.Builder{}
|
|
searchC.Grow(len(filter.Search))
|
|
|
|
for block := range nip27.Parse(filter.Search) {
|
|
if block.Pointer != nil {
|
|
genericRef := bleve.NewTermQuery(block.Pointer.AsTagReference())
|
|
genericRef.SetField(labelReferencesField)
|
|
genericRef.SetBoost(2)
|
|
|
|
var ref bleveQuery.Query = genericRef
|
|
if profile, ok := block.Pointer.(nostr.ProfilePointer); ok {
|
|
authorQuery := bleve.NewTermQuery(profile.PublicKey.Hex()[56:])
|
|
authorQuery.SetField(labelAuthorField)
|
|
authorQuery.SetBoost(2)
|
|
orRef := bleve.NewDisjunctionQuery()
|
|
orRef.AddQuery(genericRef)
|
|
orRef.AddQuery(authorQuery)
|
|
ref = orRef
|
|
} else if addr, ok := block.Pointer.(nostr.EntityPointer); ok {
|
|
authorQuery := bleve.NewTermQuery(addr.PublicKey.Hex()[56:])
|
|
authorQuery.SetField(labelAuthorField)
|
|
authorQuery.SetBoost(2)
|
|
orRef := bleve.NewDisjunctionQuery()
|
|
orRef.AddQuery(genericRef)
|
|
orRef.AddQuery(authorQuery)
|
|
ref = orRef
|
|
}
|
|
and = append(and, ref)
|
|
} else {
|
|
searchC.WriteString(strings.TrimSpace(block.Text))
|
|
}
|
|
}
|
|
|
|
searchContent := searchC.String()
|
|
|
|
var exactMatches []string
|
|
if len(searchContent) > 0 {
|
|
contentQueries := make([]bleveQuery.Query, 0, len(b.Languages)+1)
|
|
|
|
searchQ, exactMatches_, err := parse(searchContent, labelContentField+"_"+b.languageCodes[0])
|
|
if err != nil {
|
|
for _, code := range b.languageCodes {
|
|
match := bleve.NewMatchQuery(searchContent)
|
|
match.SetField(labelContentField + "_" + code)
|
|
contentQueries = append(contentQueries, match)
|
|
}
|
|
} else {
|
|
contentQueries = append(contentQueries, searchQ)
|
|
for _, code := range b.languageCodes[1:] {
|
|
searchQ, _, _ := parse(searchContent, labelContentField+"_"+code)
|
|
contentQueries = append(contentQueries, searchQ)
|
|
}
|
|
}
|
|
exactMatches = exactMatches_
|
|
|
|
extrasQ := bleve.NewMatchQuery(searchContent)
|
|
extrasQ.SetField(labelExtrasField)
|
|
contentQueries = append(contentQueries, extrasQ)
|
|
|
|
and = append(and, bleveQuery.NewDisjunctionQuery(contentQueries))
|
|
}
|
|
|
|
if len(filter.Kinds) > 0 {
|
|
eitherKind := bleve.NewDisjunctionQuery()
|
|
for _, kind := range filter.Kinds {
|
|
kindQ := bleve.NewTermQuery(strconv.Itoa(int(kind)))
|
|
kindQ.SetField(labelKindField)
|
|
eitherKind.AddQuery(kindQ)
|
|
}
|
|
and = append(and, eitherKind)
|
|
}
|
|
|
|
if len(filter.Authors) > 0 {
|
|
eitherPubkey := bleve.NewDisjunctionQuery()
|
|
for _, pubkey := range filter.Authors {
|
|
pubkeyQ := bleve.NewTermQuery(pubkey.Hex()[56:])
|
|
pubkeyQ.SetField(labelAuthorField)
|
|
eitherPubkey.AddQuery(pubkeyQ)
|
|
}
|
|
and = append(and, eitherPubkey)
|
|
}
|
|
|
|
if filter.Since != 0 || filter.Until != 0 {
|
|
var min time.Time
|
|
if filter.Since != 0 {
|
|
min = filter.Since.Time()
|
|
}
|
|
var max time.Time
|
|
if filter.Until != 0 {
|
|
max = filter.Until.Time()
|
|
} else {
|
|
max = time.Now()
|
|
}
|
|
dateRangeQ := bleve.NewDateRangeQuery(min, max)
|
|
dateRangeQ.SetField(labelCreatedAtField)
|
|
and = append(and, dateRangeQ)
|
|
}
|
|
|
|
q := bleveQuery.NewConjunctionQuery(and)
|
|
req := bleve.NewSearchRequest(q)
|
|
req.Size = maxLimit
|
|
req.From = 0
|
|
req.Explain = true
|
|
|
|
result, err := b.index.Search(req)
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
resultHit:
|
|
for _, hit := range result.Hits {
|
|
id, err := nostr.IDFromHex(hit.ID)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
for evt := range b.RawEventStore.QueryEvents(nostr.Filter{IDs: []nostr.ID{id}}, 1) {
|
|
for _, exactMatch := range exactMatches {
|
|
if !strings.Contains(strings.ToLower(evt.Content), exactMatch) {
|
|
continue resultHit
|
|
}
|
|
}
|
|
|
|
for f, v := range filter.Tags {
|
|
if !evt.Tags.ContainsAny(f, v) {
|
|
continue resultHit
|
|
}
|
|
}
|
|
|
|
if !yield(evt) {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|