Files

487 lines
13 KiB
Go

package bleve
import (
"encoding/json"
"errors"
"fmt"
"iter"
"slices"
"strconv"
"strings"
"sync"
"time"
"fiatjaf.com/nostr"
"fiatjaf.com/nostr/eventstore"
"fiatjaf.com/nostr/nip27"
"fiatjaf.com/nostr/nip73"
"fiatjaf.com/nostr/sdk"
bleve "github.com/blevesearch/bleve/v2"
_ "github.com/blevesearch/bleve/v2/analysis/analyzer/simple"
_ "github.com/blevesearch/bleve/v2/analysis/lang/ar"
_ "github.com/blevesearch/bleve/v2/analysis/lang/cjk"
_ "github.com/blevesearch/bleve/v2/analysis/lang/da"
_ "github.com/blevesearch/bleve/v2/analysis/lang/de"
_ "github.com/blevesearch/bleve/v2/analysis/lang/en"
_ "github.com/blevesearch/bleve/v2/analysis/lang/es"
_ "github.com/blevesearch/bleve/v2/analysis/lang/fa"
_ "github.com/blevesearch/bleve/v2/analysis/lang/fi"
_ "github.com/blevesearch/bleve/v2/analysis/lang/fr"
_ "github.com/blevesearch/bleve/v2/analysis/lang/gl"
_ "github.com/blevesearch/bleve/v2/analysis/lang/hi"
_ "github.com/blevesearch/bleve/v2/analysis/lang/hr"
_ "github.com/blevesearch/bleve/v2/analysis/lang/hu"
_ "github.com/blevesearch/bleve/v2/analysis/lang/in"
_ "github.com/blevesearch/bleve/v2/analysis/lang/it"
_ "github.com/blevesearch/bleve/v2/analysis/lang/nl"
_ "github.com/blevesearch/bleve/v2/analysis/lang/no"
_ "github.com/blevesearch/bleve/v2/analysis/lang/pl"
_ "github.com/blevesearch/bleve/v2/analysis/lang/pt"
_ "github.com/blevesearch/bleve/v2/analysis/lang/ro"
_ "github.com/blevesearch/bleve/v2/analysis/lang/ru"
_ "github.com/blevesearch/bleve/v2/analysis/lang/sv"
_ "github.com/blevesearch/bleve/v2/analysis/lang/tr"
bleveMapping "github.com/blevesearch/bleve/v2/mapping"
bleveQuery "github.com/blevesearch/bleve/v2/search/query"
"github.com/pemistahl/lingua-go"
)
// Names of the fields stored in every bleve document produced by this backend.
const (
	// labelContentField is the prefix of the analyzed content fields; the
	// actual field name is this prefix plus "_" plus the analyzer language code.
	labelContentField = "c"
	// labelKindField holds the event kind as a decimal string keyword.
	labelKindField = "k"
	// labelCreatedAtField holds the event creation time as a datetime.
	labelCreatedAtField = "a"
	// labelAuthorField holds the tail of the author's hex public key.
	labelAuthorField = "p"
	// labelReferencesField holds keyword references extracted from tags and
	// from the content.
	labelReferencesField = "r"
	// labelExtrasField holds additional free text analyzed with the "simple"
	// analyzer.
	labelExtrasField = "x"
)
// SupportedLanguages lists the lingua languages this backend knows how to
// index. Each of them maps to its own bleve analyzer, except for Chinese,
// Japanese and Korean, which all share the "cjk" analyzer.
var SupportedLanguages = []lingua.Language{
	lingua.Arabic,
	lingua.Chinese,
	lingua.Croatian,
	lingua.Danish,
	lingua.Dutch,
	lingua.English,
	lingua.Finnish,
	lingua.French,
	lingua.German,
	lingua.Hindi,
	lingua.Hungarian,
	lingua.Italian,
	lingua.Japanese,
	lingua.Korean,
	lingua.Persian,
	lingua.Polish,
	lingua.Portuguese,
	lingua.Romanian,
	lingua.Russian,
	lingua.Spanish,
	lingua.Swedish,
	lingua.Turkish,
}
// BleveBackend is a full-text search index for nostr events backed by bleve.
// Only search data is kept in the index; the events themselves are loaded
// from RawEventStore when a query produces hits.
//
// Fill in the exported fields and call Init before using any other method.
type BleveBackend struct {
	sync.Mutex

	// Path is the location of the bleve index on disk. Required.
	Path string
	// RawEventStore is the store that search hits are resolved against by
	// event ID. Required.
	RawEventStore eventstore.Store
	// ReadOnly is passed to bleve as the "read_only" open option.
	ReadOnly bool
	// OpenTimeout, when non-zero, is passed to bleve as the "bolt_timeout"
	// open option.
	OpenTimeout time.Duration
	// IndexableKinds restricts which event kinds SaveEvent indexes. Init
	// fills in a default list when it is empty.
	IndexableKinds []nostr.Kind
	// Languages are the languages content may be indexed in. Required. Init
	// drops entries whose analyzer duplicates an earlier entry.
	Languages []lingua.Language

	// languageCodes holds the bleve analyzer name of each entry in Languages,
	// in the same order.
	languageCodes []string
	// index is the open bleve index, set by Init.
	index bleve.Index
	// detector guesses the content language; Init only builds it when two or
	// more languages are configured.
	detector lingua.LanguageDetector
}
// Init validates the configuration, opens the bleve index at b.Path and
// prepares language detection. When no index exists at that path a new one is
// created with a static mapping built from the configured languages.
//
// Path, RawEventStore and Languages are required. IndexableKinds receives a
// default list when empty. Languages is rewritten so that only the first
// language for each analyzer is kept (Chinese, Japanese and Korean all use
// "cjk"), and languageCodes is populated in the same order.
//
// It returns an error when a required field is missing or when the index can
// neither be opened nor created.
func (b *BleveBackend) Init() error {
	if b.Path == "" {
		return errors.New("missing Path")
	}
	if b.RawEventStore == nil {
		return errors.New("missing RawEventStore")
	}
	if len(b.Languages) == 0 {
		return errors.New("missing Languages")
	}
	if len(b.IndexableKinds) == 0 {
		b.IndexableKinds = []nostr.Kind{0, 1, 6, 11, 16, 20, 21, 22, 24, 1111, 9802, 30023, 30818}
	}

	// Keep a single language per analyzer code so each content field is
	// declared (and later queried) exactly once.
	languages := make([]lingua.Language, 0, len(b.Languages))
	codes := make([]string, 0, len(b.Languages))
	for _, lang := range b.Languages {
		var code string
		switch lang {
		case lingua.Chinese, lingua.Korean, lingua.Japanese:
			code = "cjk"
		default:
			code = strings.ToLower(lang.IsoCode639_1().String())
		}
		if slices.Contains(codes, code) {
			continue
		}
		languages = append(languages, lang)
		codes = append(codes, code)
	}
	b.Languages = languages
	b.languageCodes = codes

	opts := map[string]any{
		"read_only": b.ReadOnly,
	}
	if b.OpenTimeout != 0 {
		opts["bolt_timeout"] = b.OpenTimeout.String()
	}

	index, err := bleve.OpenUsing(b.Path, opts)
	switch {
	case errors.Is(err, bleve.ErrorIndexPathDoesNotExist):
		index, err = bleve.New(b.Path, newEventIndexMapping(b.languageCodes))
		if err != nil {
			return fmt.Errorf("error creating index: %w", err)
		}
	case err != nil:
		return fmt.Errorf("error opening index: %w", err)
	}
	b.index = index

	// With a single language there is nothing to detect.
	if len(b.Languages) >= 2 {
		b.detector = lingua.NewLanguageDetectorBuilder().
			FromLanguages(b.Languages...).
			Build()
	}

	return nil
}

// newEventIndexMapping builds the static mapping used when a new index is
// created: one analyzed content field per language code, plus the extras,
// references, author, kind and created-at fields.
func newEventIndexMapping(languageCodes []string) *bleveMapping.IndexMappingImpl {
	// Fields are only ever searched by name, so nothing is stored, no term
	// vectors or doc values are kept, and no field (author included) is
	// copied into the composite "_all" field.
	searchOnly := func(field *bleveMapping.FieldMapping) *bleveMapping.FieldMapping {
		field.Store = false
		field.IncludeTermVectors = false
		field.DocValues = false
		field.IncludeInAll = false
		return field
	}

	doc := bleveMapping.NewDocumentStaticMapping()

	for _, code := range languageCodes {
		contentField := searchOnly(bleveMapping.NewTextFieldMapping())
		contentField.Analyzer = code
		doc.AddFieldMappingsAt(labelContentField+"_"+code, contentField)
	}

	extrasField := searchOnly(bleveMapping.NewTextFieldMapping())
	extrasField.Analyzer = "simple"
	doc.AddFieldMappingsAt(labelExtrasField, extrasField)

	doc.AddFieldMappingsAt(labelReferencesField, searchOnly(bleveMapping.NewKeywordFieldMapping()))
	doc.AddFieldMappingsAt(labelAuthorField, searchOnly(bleveMapping.NewKeywordFieldMapping()))
	doc.AddFieldMappingsAt(labelKindField, searchOnly(bleveMapping.NewKeywordFieldMapping()))
	doc.AddFieldMappingsAt(labelCreatedAtField, searchOnly(bleveMapping.NewDateTimeFieldMapping()))

	mapping := bleveMapping.NewIndexMapping()
	mapping.DefaultMapping.Dynamic = false
	mapping.AddDocumentMapping("_default", doc)
	return mapping
}
// Close closes the underlying bleve index. It is a no-op on a nil backend or
// on one whose index was never opened. Any error reported by the index is
// discarded.
func (b *BleveBackend) Close() {
	if b == nil || b.index == nil {
		return
	}
	b.index.Close()
}
// SaveEvent indexes event when its kind is listed in IndexableKinds. Events
// of any other kind are ignored and nil is returned.
func (b *BleveBackend) SaveEvent(event nostr.Event) error {
	if !slices.Contains(b.IndexableKinds, event.Kind) {
		return nil
	}
	return b.indexEvent(event)
}
// DeleteEvent removes the document keyed by the hex form of id from the
// index. It does nothing and returns nil on a nil backend or on one whose
// index was never opened.
func (b *BleveBackend) DeleteEvent(id nostr.ID) error {
	if b == nil || b.index == nil {
		return nil
	}
	return b.index.Delete(id.Hex())
}
// indexEvent builds the search document for evt and writes it to the index
// under the hex ID of evt.
//
// Kind-specific preprocessing:
//   - kinds 6 and 16: the content is decoded as an embedded event and that
//     event is indexed instead (still under the outer event's ID); when the
//     content does not decode or its signature does not verify, nothing is
//     indexed and nil is returned.
//   - kind 0: when the content decodes as profile metadata, the name, display
//     name and about text become the content and a non-empty NIP-05 identifier
//     is added to the references.
//
// The values of "comment", "name", "title", "about" and "description" tags
// are appended to the content; "e", "a" and "r" tags and every reference found
// in the content are stored as references. The text parts of the content are
// indexed in the field of the detected language.
//
// It returns an error only when writing to the index fails.
func (b *BleveBackend) indexEvent(evt nostr.Event) error {
	docID := evt.ID

	var references []string

	switch evt.Kind {
	case 6, 16:
		var inner nostr.Event
		if err := json.Unmarshal([]byte(evt.Content), &inner); err != nil || !inner.VerifySignature() {
			return nil
		}
		evt = inner
	case 0:
		var pm sdk.ProfileMetadata
		if err := json.Unmarshal([]byte(evt.Content), &pm); err == nil {
			evt.Content = pm.Name + "\n" + pm.DisplayName + "\n" + pm.About
			// profiles without a NIP-05 must not produce an empty reference term
			if pm.NIP05 != "" {
				references = append(references, pm.NIP05)
			}
		}
	}

	for _, tag := range evt.Tags {
		if len(tag) < 2 {
			continue
		}

		switch tag[0] {
		case "comment", "name", "title", "about", "description":
			evt.Content += "\n\n" + tag[1]
		case "e":
			if ptr, err := nostr.EventPointerFromTag(tag); err == nil {
				references = append(references, ptr.AsTagReference())
			}
		case "a":
			if ptr, err := nostr.EntityPointerFromTag(tag); err == nil {
				references = append(references, ptr.AsTagReference())
			}
		case "r":
			references = append(references, tag[1])
		}
	}

	var content, extras strings.Builder
	content.Grow(len(evt.Content))
	for block := range nip27.Parse(evt.Content) {
		if block.Pointer == nil {
			text := strings.TrimSpace(block.Text)
			if text == "" {
				continue
			}
			// Keep the text on either side of a reference apart, otherwise
			// "look at <url> cool" would be indexed as "look atcool".
			if content.Len() > 0 {
				content.WriteByte(' ')
			}
			content.WriteString(text)
			continue
		}

		references = append(references, block.Pointer.AsTagReference())
		if ep, ok := block.Pointer.(nip73.ExternalPointer); ok {
			extras.WriteString(ep.Thing)
			extras.WriteByte(' ')
		}
	}
	indexableContent := content.String()

	lang := b.Languages[0]
	if len(b.Languages) > 1 {
		if detected, ok := b.detector.DetectLanguageOf(indexableContent); ok {
			lang = detected
		} else if slices.Contains(b.Languages, lingua.English) {
			// English is only a usable fallback when it is configured; a
			// content field for a language that is not configured is not part
			// of the mapping Init creates, so the text would not be searchable.
			lang = lingua.English
		}
	}

	var analyzerLangCode string
	switch lang {
	case lingua.Japanese, lingua.Chinese, lingua.Korean:
		analyzerLangCode = "cjk"
	default:
		analyzerLangCode = strings.ToLower(lang.IsoCode639_1().String())
	}

	doc := map[string]any{
		labelKindField: strconv.Itoa(int(evt.Kind)),
		// only the last 8 hex characters of the public key are indexed
		labelAuthorField:                         evt.PubKey.Hex()[56:],
		labelCreatedAtField:                      evt.CreatedAt.Time(),
		labelContentField + "_" + analyzerLangCode: indexableContent,
		labelReferencesField:                     references,
		labelExtrasField:                         extras.String(),
	}

	if err := b.index.Index(docID.Hex(), doc); err != nil {
		return fmt.Errorf("failed to index '%s' document: %w", docID.Hex(), err)
	}

	return nil
}
// CountEvents reports the number of documents in the index. Only the empty
// filter (one that serializes to "{}") is accepted; any other filter yields a
// "not supported" error.
func (b *BleveBackend) CountEvents(filter nostr.Filter) (uint32, error) {
	if filter.String() != "{}" {
		return 0, errors.New("not supported")
	}

	total, err := b.index.DocCount()
	return uint32(total), err
}
// QueryEvents runs a full-text search for filter.Search and yields the
// matching events, loaded from RawEventStore by ID.
//
// Nothing is yielded when the filter's theoretical limit is zero, when the
// trimmed search string is shorter than two bytes, or when the index search
// fails. At most min(maxLimit, theoretical limit) hits are requested from the
// index.
//
// The search string is split into references and plain text. Every reference
// must match the references field; profile and address references may
// alternatively match the author field. The plain text must match a content
// field of any configured language or the extras field. Kinds, Authors, Since
// and Until narrow the index query further. Exact-match phrases returned by
// parse and the filter's Tags are checked against each loaded event
// afterwards, so fewer events than hits may be yielded.
func (b *BleveBackend) QueryEvents(filter nostr.Filter, maxLimit int) iter.Seq[nostr.Event] {
	return func(yield func(nostr.Event) bool) {
		limit := maxLimit
		tlimit := filter.GetTheoreticalLimit()
		if tlimit == 0 {
			return
		}
		if tlimit < limit {
			limit = tlimit
		}

		search := strings.TrimSpace(filter.Search)
		if len(search) < 2 {
			return
		}

		// every entry must match (they are ANDed together below)
		clauses := make([]bleveQuery.Query, 0, 4)

		// refOrAuthor widens a reference match so it also accepts events
		// authored by the referenced public key.
		refOrAuthor := func(ref bleveQuery.Query, pubkeyHex string) bleveQuery.Query {
			author := bleve.NewTermQuery(pubkeyHex[56:])
			author.SetField(labelAuthorField)
			author.SetBoost(2)
			return bleve.NewDisjunctionQuery(ref, author)
		}

		var text strings.Builder
		text.Grow(len(search))
		for block := range nip27.Parse(search) {
			if block.Pointer == nil {
				if piece := strings.TrimSpace(block.Text); piece != "" {
					// keep words on either side of a reference apart
					if text.Len() > 0 {
						text.WriteByte(' ')
					}
					text.WriteString(piece)
				}
				continue
			}

			ref := bleve.NewTermQuery(block.Pointer.AsTagReference())
			ref.SetField(labelReferencesField)
			ref.SetBoost(2)

			switch ptr := block.Pointer.(type) {
			case nostr.ProfilePointer:
				clauses = append(clauses, refOrAuthor(ref, ptr.PublicKey.Hex()))
			case nostr.EntityPointer:
				clauses = append(clauses, refOrAuthor(ref, ptr.PublicKey.Hex()))
			default:
				clauses = append(clauses, ref)
			}
		}

		var exactMatches []string
		if searchContent := text.String(); searchContent != "" {
			contentQueries := make([]bleveQuery.Query, 0, len(b.languageCodes)+1)

			parsed, exact, err := parse(searchContent, labelContentField+"_"+b.languageCodes[0])
			if err != nil {
				// the text is not valid query syntax: treat it as a plain match
				for _, code := range b.languageCodes {
					match := bleve.NewMatchQuery(searchContent)
					match.SetField(labelContentField + "_" + code)
					contentQueries = append(contentQueries, match)
				}
			} else {
				contentQueries = append(contentQueries, parsed)
				for _, code := range b.languageCodes[1:] {
					other, _, err := parse(searchContent, labelContentField+"_"+code)
					if err != nil {
						// never hand a query from a failed parse to bleve
						continue
					}
					contentQueries = append(contentQueries, other)
				}
			}
			exactMatches = exact

			extrasQ := bleve.NewMatchQuery(searchContent)
			extrasQ.SetField(labelExtrasField)
			contentQueries = append(contentQueries, extrasQ)

			clauses = append(clauses, bleveQuery.NewDisjunctionQuery(contentQueries))
		}

		if len(filter.Kinds) > 0 {
			anyKind := bleve.NewDisjunctionQuery()
			for _, kind := range filter.Kinds {
				kindQ := bleve.NewTermQuery(strconv.Itoa(int(kind)))
				kindQ.SetField(labelKindField)
				anyKind.AddQuery(kindQ)
			}
			clauses = append(clauses, anyKind)
		}

		if len(filter.Authors) > 0 {
			anyAuthor := bleve.NewDisjunctionQuery()
			for _, pubkey := range filter.Authors {
				pubkeyQ := bleve.NewTermQuery(pubkey.Hex()[56:])
				pubkeyQ.SetField(labelAuthorField)
				anyAuthor.AddQuery(pubkeyQ)
			}
			clauses = append(clauses, anyAuthor)
		}

		if filter.Since != 0 || filter.Until != 0 {
			// a zero start leaves the lower bound unset; a missing upper
			// bound defaults to the current time
			var from time.Time
			if filter.Since != 0 {
				from = filter.Since.Time()
			}
			to := time.Now()
			if filter.Until != 0 {
				to = filter.Until.Time()
			}

			dateRangeQ := bleve.NewDateRangeQuery(from, to)
			dateRangeQ.SetField(labelCreatedAtField)
			clauses = append(clauses, dateRangeQ)
		}

		// Score explanations are not requested: nothing here reads them.
		req := bleve.NewSearchRequest(bleveQuery.NewConjunctionQuery(clauses))
		req.Size = limit
		req.From = 0

		result, err := b.index.Search(req)
		if err != nil {
			return
		}

	resultHit:
		for _, hit := range result.Hits {
			id, err := nostr.IDFromHex(hit.ID)
			if err != nil {
				continue
			}

			for evt := range b.RawEventStore.QueryEvents(nostr.Filter{IDs: []nostr.ID{id}}, 1) {
				if len(exactMatches) > 0 {
					// lowercase once per event rather than once per phrase
					lowered := strings.ToLower(evt.Content)
					for _, exactMatch := range exactMatches {
						if !strings.Contains(lowered, exactMatch) {
							continue resultHit
						}
					}
				}

				for name, values := range filter.Tags {
					if !evt.Tags.ContainsAny(name, values) {
						continue resultHit
					}
				}

				if !yield(evt) {
					return
				}
			}
		}
	}
}