diff --git a/eventstore/bleve/bleve_test.go b/eventstore/bleve/bleve_test.go index dd6f437..54acf49 100644 --- a/eventstore/bleve/bleve_test.go +++ b/eventstore/bleve/bleve_test.go @@ -7,6 +7,7 @@ import ( "fiatjaf.com/nostr" "fiatjaf.com/nostr/eventstore/lmdb" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestBleveFlow(t *testing.T) { @@ -21,7 +22,9 @@ func TestBleveFlow(t *testing.T) { Path: "/tmp/blevetest-bleve", RawEventStore: bb, } - bl.Init() + err := bl.Init() + require.NoError(t, err, "init") + defer bl.Close() willDelete := make([]nostr.Event, 0, 3) diff --git a/eventstore/bleve/delete.go b/eventstore/bleve/delete.go deleted file mode 100644 index 4507dad..0000000 --- a/eventstore/bleve/delete.go +++ /dev/null @@ -1,9 +0,0 @@ -package bleve - -import ( - "fiatjaf.com/nostr" -) - -func (b *BleveBackend) DeleteEvent(id nostr.ID) error { - return b.index.Delete(id.Hex()) -} diff --git a/eventstore/bleve/helpers.go b/eventstore/bleve/helpers.go deleted file mode 100644 index 2d89f76..0000000 --- a/eventstore/bleve/helpers.go +++ /dev/null @@ -1,9 +0,0 @@ -package bleve - -const ( - idField = "i" - contentField = "c" - kindField = "k" - createdAtField = "a" - pubkeyField = "p" -) diff --git a/eventstore/bleve/lexer.go b/eventstore/bleve/lexer.go new file mode 100644 index 0000000..cdfe494 --- /dev/null +++ b/eventstore/bleve/lexer.go @@ -0,0 +1,104 @@ +package bleve + +import ( + "strings" + "unicode" +) + +// lexer tokenizes the input string +type Lexer struct { + input string + pos int + + peekedQueue []Token +} + +func NewLexer(input string) *Lexer { + return &Lexer{input: input, pos: 0} +} + +func (l *Lexer) peek() rune { + if l.pos >= len(l.input) { + return 0 + } + return rune(l.input[l.pos]) +} + +func (l *Lexer) advance() rune { + if l.pos >= len(l.input) { + return 0 + } + ch := rune(l.input[l.pos]) + l.pos++ + return ch +} + +func (l *Lexer) skipWhitespace() { + for l.peek() != 0 && unicode.IsSpace(l.peek()) { + l.advance() + } +} + +func (l *Lexer) readWord() string { + start := l.pos + + // read regular word (alphanumeric, hyphens, underscores) + for l.peek() != 0 && !unicode.IsSpace(l.peek()) && + l.peek() != '(' && l.peek() != ')' && l.peek() != '"' { + l.advance() + } + + return l.input[start:l.pos] +} + +func (l *Lexer) PeekToken() Token { + next := l.NextToken() + l.peekedQueue = append(l.peekedQueue, next) + return next +} + +func (l *Lexer) ReturnToken(tok Token) { + l.peekedQueue = append(l.peekedQueue, tok) +} + +func (l *Lexer) NextToken() (tok Token) { + if len(l.peekedQueue) > 0 { + next := l.peekedQueue[len(l.peekedQueue)-1] + l.peekedQueue = l.peekedQueue[0 : len(l.peekedQueue)-1] + return next + } + + l.skipWhitespace() + + if l.pos >= len(l.input) { + return Token{Type: TokenEOF} + } + + ch := l.peek() + + switch ch { + case '(': + l.advance() + return Token{Type: TokenLParen, Value: "("} + case ')': + l.advance() + return Token{Type: TokenRParen, Value: ")"} + case '"': + l.advance() + return Token{Type: TokenQuote, Value: "\""} + default: + word := l.readWord() + upperWord := strings.ToUpper(word) + + switch upperWord { + case "OR", "||": + return Token{Type: TokenOR, Value: word} + case "AND", "&&": + return Token{Type: TokenAND, Value: word} + case "NOT", "!": + return Token{Type: TokenNOT, Value: word} + default: + return Token{Type: TokenWord, Value: word} + } + } +} diff --git a/eventstore/bleve/lib.go b/eventstore/bleve/lib.go index 10138c9..5072417 100644 --- a/eventstore/bleve/lib.go +++ b/eventstore/bleve/lib.go @@ -1,34 +1,99 @@ package bleve import ( + "encoding/json" "errors" "fmt" + "iter" + "slices" + "strconv" + "strings" "sync" + "time" "fiatjaf.com/nostr" "fiatjaf.com/nostr/eventstore" + "fiatjaf.com/nostr/nip27" + "fiatjaf.com/nostr/nip73" + "fiatjaf.com/nostr/sdk" bleve "github.com/blevesearch/bleve/v2" + _ "github.com/blevesearch/bleve/v2/analysis/analyzer/simple" + _ "github.com/blevesearch/bleve/v2/analysis/lang/ar" + _ "github.com/blevesearch/bleve/v2/analysis/lang/cjk" + _ "github.com/blevesearch/bleve/v2/analysis/lang/da" + _ "github.com/blevesearch/bleve/v2/analysis/lang/de" + _ "github.com/blevesearch/bleve/v2/analysis/lang/en" + _ "github.com/blevesearch/bleve/v2/analysis/lang/es" + _ "github.com/blevesearch/bleve/v2/analysis/lang/fa" + _ "github.com/blevesearch/bleve/v2/analysis/lang/fi" + _ "github.com/blevesearch/bleve/v2/analysis/lang/fr" + _ "github.com/blevesearch/bleve/v2/analysis/lang/gl" + _ "github.com/blevesearch/bleve/v2/analysis/lang/hi" + _ "github.com/blevesearch/bleve/v2/analysis/lang/hr" + _ "github.com/blevesearch/bleve/v2/analysis/lang/hu" + _ "github.com/blevesearch/bleve/v2/analysis/lang/in" + _ "github.com/blevesearch/bleve/v2/analysis/lang/it" + _ "github.com/blevesearch/bleve/v2/analysis/lang/nl" + _ "github.com/blevesearch/bleve/v2/analysis/lang/no" + _ "github.com/blevesearch/bleve/v2/analysis/lang/pl" + _ "github.com/blevesearch/bleve/v2/analysis/lang/pt" + _ "github.com/blevesearch/bleve/v2/analysis/lang/ro" + _ "github.com/blevesearch/bleve/v2/analysis/lang/ru" + _ "github.com/blevesearch/bleve/v2/analysis/lang/sv" + _ "github.com/blevesearch/bleve/v2/analysis/lang/tr" bleveMapping "github.com/blevesearch/bleve/v2/mapping" + bleveQuery "github.com/blevesearch/bleve/v2/search/query" + "github.com/pemistahl/lingua-go" ) -var _ eventstore.Store = (*BleveBackend)(nil) +const ( + labelContentField = "c" + labelKindField = "k" + labelCreatedAtField = "a" + labelAuthorField = "p" + labelReferencesField = "r" + labelExtrasField = "x" +) + +var SupportedLanguages = []lingua.Language{ + // each of these translates to a specific bleve analyzer + // except for japanese-korean-chinese that all use the same "cjk" analyzer + lingua.Arabic, + lingua.Chinese, + lingua.Croatian, + lingua.Danish, + lingua.Dutch, + lingua.English, + lingua.Finnish, + lingua.French, + lingua.German, + lingua.Hindi, + lingua.Hungarian, + lingua.Italian, + lingua.Japanese, + lingua.Korean, + lingua.Persian, + lingua.Polish, + lingua.Portuguese, + lingua.Romanian, + lingua.Russian, + lingua.Spanish, + lingua.Swedish, + lingua.Turkish, +} type BleveBackend struct { sync.Mutex - // Path is where the index will be saved - Path string + Path string + RawEventStore eventstore.Store + ReadOnly bool + IndexableKinds []nostr.Kind - // RawEventStore is where we'll fetch the raw events from - // bleve will only store ids, so the actual events must be somewhere else - RawEventStore eventstore.Store + Languages []lingua.Language + languageCodes []string - index bleve.Index -} - -func (b *BleveBackend) Close() { - if b.index != nil { - b.index.Close() - } + index bleve.Index + detector lingua.LanguageDetector } func (b *BleveBackend) Init() error { @@ -38,12 +103,89 @@ func (b *BleveBackend) Init() error { if b.RawEventStore == nil { return fmt.Errorf("missing RawEventStore") } + if len(b.IndexableKinds) == 0 { + b.IndexableKinds = []nostr.Kind{0, 1, 6, 11, 16, 20, 21, 22, 24, 1111, 9802, 30023, 30818} + } - // try to open existing index - index, err := bleve.Open(b.Path) + if len(b.Languages) == 0 { + b.Languages = SupportedLanguages + } + validLanguages := make([]lingua.Language, 0, len(b.Languages)) + b.languageCodes = make([]string, 0, len(b.Languages)) + for _, lang := range b.Languages { + var code string + + switch lang { + case lingua.Chinese, lingua.Korean, lingua.Japanese: + code = "cjk" + default: + code = strings.ToLower(lang.IsoCode639_1().String()) + } + + if slices.Contains(b.languageCodes, code) { + continue + } + + validLanguages = append(validLanguages, lang) + b.languageCodes = append(b.languageCodes, code) + } + b.Languages = validLanguages + + index, err := bleve.OpenUsing(b.Path, map[string]any{ + "read_only": b.ReadOnly, + }) if err == bleve.ErrorIndexPathDoesNotExist { - // create new index with default mapping mapping := bleveMapping.NewIndexMapping() + mapping.DefaultMapping.Dynamic = false + doc := bleveMapping.NewDocumentStaticMapping() + + for _, code := range b.languageCodes { + contentField := bleveMapping.NewTextFieldMapping() + contentField.Analyzer = code + contentField.Store = false + contentField.IncludeTermVectors = false + contentField.DocValues = false + contentField.IncludeInAll = false + doc.AddFieldMappingsAt(labelContentField+"_"+code, contentField) + } + + extrasField := bleveMapping.NewTextFieldMapping() + extrasField.Analyzer = "simple" + extrasField.Store = false + extrasField.IncludeTermVectors = false + extrasField.DocValues = false + extrasField.IncludeInAll = false + doc.AddFieldMappingsAt(labelExtrasField, extrasField) + + referencesField := bleveMapping.NewKeywordFieldMapping() + referencesField.DocValues = false + referencesField.Store = false + referencesField.IncludeTermVectors = false + referencesField.IncludeInAll = false + doc.AddFieldMappingsAt(labelReferencesField, referencesField) + + authorField := bleveMapping.NewKeywordFieldMapping() + authorField.DocValues = false + authorField.Store = false + authorField.IncludeTermVectors = false + doc.AddFieldMappingsAt(labelAuthorField, authorField) + + kindField := bleveMapping.NewKeywordFieldMapping() + kindField.DocValues = false + kindField.Store = false + kindField.IncludeTermVectors = false + kindField.IncludeInAll = false + doc.AddFieldMappingsAt(labelKindField, kindField) + + timestampField := bleveMapping.NewDateTimeFieldMapping() + timestampField.DocValues = false + timestampField.Store = false + timestampField.IncludeTermVectors = false + timestampField.IncludeInAll = false + doc.AddFieldMappingsAt(labelCreatedAtField, timestampField) + + mapping.AddDocumentMapping("_default", doc) + index, err = bleve.New(b.Path, mapping) if err != nil { return fmt.Errorf("error creating index: %w", err) @@ -53,6 +195,116 @@ func (b *BleveBackend) Init() error { } b.index = index + b.detector = lingua.NewLanguageDetectorBuilder(). + FromLanguages(b.Languages...). + Build() + + return nil +} + +func (b *BleveBackend) Close() { + if b != nil && b.index != nil { + b.index.Close() + } +} + +func (b *BleveBackend) SaveEvent(event nostr.Event) error { + if slices.Contains(b.IndexableKinds, event.Kind) { + return b.indexEvent(event) + } + return nil +} + +func (b *BleveBackend) DeleteEvent(id nostr.ID) error { + if b != nil && b.index != nil { + return b.index.Delete(id.Hex()) + } + return nil +} + +func (b *BleveBackend) indexEvent(evt nostr.Event) error { + docID := evt.ID + + var references []string + var extras string + + switch evt.Kind { + case 6, 16: + var innerEvt nostr.Event + if err := json.Unmarshal([]byte(evt.Content), &innerEvt); err != nil || !innerEvt.VerifySignature() { + return nil + } + evt = innerEvt + case 0: + var pm sdk.ProfileMetadata + if err := json.Unmarshal([]byte(evt.Content), &pm); err == nil { + evt.Content = pm.Name + "\n" + pm.DisplayName + "\n" + pm.About + references = append(references, pm.NIP05) + } + case 9802: + for _, tag := range evt.Tags { + if len(tag) < 2 { + continue + } + switch tag[0] { + case "comment": + evt.Content += "\n\n" + tag[1] + case "e": + if ptr, err := nostr.EventPointerFromTag(tag); err == nil { + references = append(references, ptr.AsTagReference()) + } + case "a": + if ptr, err := nostr.EntityPointerFromTag(tag); err == nil { + references = append(references, ptr.AsTagReference()) + } + case "r": + references = append(references, tag[1]) + } + } + } + + doc := map[string]any{ + labelKindField: strconv.Itoa(int(evt.Kind)), + labelAuthorField: evt.PubKey.Hex()[56:], + labelCreatedAtField: evt.CreatedAt.Time(), + } + + content := strings.Builder{} + content.Grow(len(evt.Content)) + + for block := range nip27.Parse(evt.Content) { + if block.Pointer == nil { + content.WriteString(strings.TrimSpace(block.Text)) + } else { + references = append(references, block.Pointer.AsTagReference()) + if ep, ok := block.Pointer.(nip73.ExternalPointer); ok { + extras += ep.Thing + " " + } + } + } + + indexableContent := content.String() + lang, ok := b.detector.DetectLanguageOf(indexableContent) + if !ok { + lang = lingua.English + } + + var analyzerLangCode string + switch lang { + case lingua.Japanese, lingua.Chinese, lingua.Korean: + analyzerLangCode = "cjk" + default: + analyzerLangCode = strings.ToLower(lang.IsoCode639_1().String()) + } + doc[labelContentField+"_"+analyzerLangCode] = indexableContent + + doc[labelReferencesField] = references + doc[labelExtrasField] = extras + + if err := b.index.Index(docID.Hex(), doc); err != nil { + return fmt.Errorf("failed to index '%s' document: %w", docID.Hex(), err) + } + return nil } @@ -64,3 +316,154 @@ func (b *BleveBackend) CountEvents(filter nostr.Filter) (uint32, error) { return 0, errors.New("not supported") } + +func (b *BleveBackend) QueryEvents(filter nostr.Filter, maxLimit int) iter.Seq[nostr.Event] { + return func(yield func(nostr.Event) bool) { + if tlimit := filter.GetTheoreticalLimit(); tlimit == 0 { + return + } else if tlimit < maxLimit { + maxLimit = tlimit + } + + filter.Search = strings.TrimSpace(filter.Search) + if len(filter.Search) < 2 { + return + } + + and := make([]bleveQuery.Query, 0, 3) + + searchC := strings.Builder{} + searchC.Grow(len(filter.Search)) + + for block := range nip27.Parse(filter.Search) { + if block.Pointer != nil { + genericRef := bleve.NewTermQuery(block.Pointer.AsTagReference()) + genericRef.SetField(labelReferencesField) + genericRef.SetBoost(2) + + var ref bleveQuery.Query = genericRef + if profile, ok := block.Pointer.(nostr.ProfilePointer); ok { + authorQuery := bleve.NewTermQuery(profile.PublicKey.Hex()[56:]) + authorQuery.SetField(labelAuthorField) + authorQuery.SetBoost(2) + orRef := bleve.NewDisjunctionQuery() + orRef.AddQuery(genericRef) + orRef.AddQuery(authorQuery) + ref = orRef + } else if addr, ok := block.Pointer.(nostr.EntityPointer); ok { + authorQuery := bleve.NewTermQuery(addr.PublicKey.Hex()[56:]) + authorQuery.SetField(labelAuthorField) + authorQuery.SetBoost(2) + orRef := bleve.NewDisjunctionQuery() + orRef.AddQuery(genericRef) + orRef.AddQuery(authorQuery) + ref = orRef + } + and = append(and, ref) + } else { + searchC.WriteString(strings.TrimSpace(block.Text)) + } + } + + searchContent := searchC.String() + + var exactMatches []string + if len(searchContent) > 0 { + contentQueries := make([]bleveQuery.Query, 0, len(b.Languages)+1) + + searchQ, exactMatches_, err := parse(searchContent, labelContentField+"_"+b.languageCodes[0]) + if err != nil { + for _, code := range b.languageCodes { + match := bleve.NewMatchQuery(searchContent) + match.SetField(labelContentField + "_" + code) + contentQueries = append(contentQueries, match) + } + } else { + contentQueries = append(contentQueries, searchQ) + for _, code := range b.languageCodes[1:] { + searchQ, _, _ := parse(searchContent, labelContentField+"_"+code) + contentQueries = append(contentQueries, searchQ) + } + } + exactMatches = exactMatches_ + + extrasQ := bleve.NewMatchQuery(searchContent) + extrasQ.SetField(labelExtrasField) + contentQueries = append(contentQueries, extrasQ) + + and = append(and, bleveQuery.NewDisjunctionQuery(contentQueries)) + } + + if len(filter.Kinds) > 0 { + eitherKind := bleve.NewDisjunctionQuery() + for _, kind := range filter.Kinds { + kindQ := bleve.NewTermQuery(strconv.Itoa(int(kind))) + kindQ.SetField(labelKindField) + eitherKind.AddQuery(kindQ) + } + and = append(and, eitherKind) + } + + if len(filter.Authors) > 0 { + eitherPubkey := bleve.NewDisjunctionQuery() + for _, pubkey := range filter.Authors { + pubkeyQ := bleve.NewTermQuery(pubkey.Hex()[56:]) + pubkeyQ.SetField(labelAuthorField) + eitherPubkey.AddQuery(pubkeyQ) + } + and = append(and, eitherPubkey) + } + + if filter.Since != 0 || filter.Until != 0 { + var min time.Time + if filter.Since != 0 { + min = filter.Since.Time() + } + var max time.Time + if filter.Until != 0 { + max = filter.Until.Time() + } else { + max = time.Now() + } + dateRangeQ := bleve.NewDateRangeQuery(min, max) + dateRangeQ.SetField(labelCreatedAtField) + and = append(and, dateRangeQ) + } + + q := bleveQuery.NewConjunctionQuery(and) + req := bleve.NewSearchRequest(q) + req.Size = maxLimit + req.From = 0 + req.Explain = true + + result, err := b.index.Search(req) + if err != nil { + return + } + + resultHit: + for _, hit := range result.Hits { + id, err := nostr.IDFromHex(hit.ID) + if err != nil { + continue + } + for evt := range b.RawEventStore.QueryEvents(nostr.Filter{IDs: []nostr.ID{id}}, 1) { + for _, exactMatch := range exactMatches { + if !strings.Contains(strings.ToLower(evt.Content), exactMatch) { + continue resultHit + } + } + + for f, v := range filter.Tags { + if !evt.Tags.ContainsAny(f, v) { + continue resultHit + } + } + + if !yield(evt) { + return + } + } + } + } +} diff --git a/eventstore/bleve/query.go b/eventstore/bleve/query.go deleted file mode 100644 index 892116d..0000000 --- a/eventstore/bleve/query.go +++ /dev/null @@ -1,94 +0,0 @@ -package bleve - -import ( - "iter" - "strconv" - - "fiatjaf.com/nostr" - bleve "github.com/blevesearch/bleve/v2" - "github.com/blevesearch/bleve/v2/search/query" -) - -func (b *BleveBackend) QueryEvents(filter nostr.Filter, maxLimit int) iter.Seq[nostr.Event] { - return func(yield func(nostr.Event) bool) { - if tlimit := filter.GetTheoreticalLimit(); tlimit == 0 { - return - } else if tlimit < maxLimit { - maxLimit = tlimit - } - - if len(filter.Search) < 2 { - return - } - - searchQ := bleve.NewMatchQuery(filter.Search) - searchQ.SetField(contentField) - var q query.Query = searchQ - - conjQueries := []query.Query{searchQ} - - if len(filter.Kinds) > 0 { - eitherKind := bleve.NewDisjunctionQuery() - for _, kind := range filter.Kinds { - kindQ := bleve.NewTermQuery(strconv.Itoa(int(kind))) - kindQ.SetField(kindField) - eitherKind.AddQuery(kindQ) - } - conjQueries = append(conjQueries, eitherKind) - } - - if len(filter.Authors) > 0 { - eitherPubkey := bleve.NewDisjunctionQuery() - for _, pubkey := range filter.Authors { - if len(pubkey) != 64 { - continue - } - pubkeyQ := bleve.NewTermQuery(pubkey.Hex()[56:]) - pubkeyQ.SetField(pubkeyField) - eitherPubkey.AddQuery(pubkeyQ) - } - conjQueries = append(conjQueries, eitherPubkey) - } - - if filter.Since != 0 || filter.Until != 0 { - var min *float64 - if filter.Since != 0 { - minVal := float64(filter.Since) - min = &minVal - } - var max *float64 - if filter.Until != 0 { - maxVal := float64(filter.Until) - max = &maxVal - } - dateRangeQ := bleve.NewNumericRangeInclusiveQuery(min, max, nil, nil) - dateRangeQ.SetField(createdAtField) - conjQueries = append(conjQueries, dateRangeQ) - } - - if len(conjQueries) > 1 { - q = bleve.NewConjunctionQuery(conjQueries...) - } - - req := bleve.NewSearchRequest(q) - req.Size = maxLimit - req.From = 0 - - result, err := b.index.Search(req) - if err != nil { - return - } - - for _, hit := range result.Hits { - id, err := nostr.IDFromHex(hit.ID) - if err != nil { - continue - } - for evt := range b.RawEventStore.QueryEvents(nostr.Filter{IDs: []nostr.ID{id}}, 1) { - if !yield(evt) { - return - } - } - } - } -} diff --git a/eventstore/bleve/query_parser.go b/eventstore/bleve/query_parser.go new file mode 100644 index 0000000..c6a9ae6 --- /dev/null +++ b/eventstore/bleve/query_parser.go @@ -0,0 +1,209 @@ +package bleve + +import ( + "strings" + + bleve "github.com/blevesearch/bleve/v2" + bleveQuery "github.com/blevesearch/bleve/v2/search/query" +) + +// token types +type TokenType int + +const ( + TokenWord TokenType = iota + TokenOR + TokenAND + TokenNOT + TokenLParen + TokenRParen + TokenQuote + TokenEOF +) + +type Token struct { + Type TokenType + Value string +} + +type Parser struct { + lexer *Lexer + field string +} + +func parse(input string, field string) (bleveQuery.Query, []string, error) { + lexer := NewLexer(input) + p := &Parser{ + lexer: lexer, + } + + var exactMatches []string + var reusableCurrentMatch strings.Builder + var currentExactMatch *strings.Builder + var currentWords []string + var negated bool + var parents []bleveQuery.Query + var parentOps []TokenType // tracks if parent should be AND or OR + var lastOp TokenType = TokenAND // track last operator for parentheses + + curr := bleve.NewBooleanQuery() + + for { + token := p.lexer.NextToken() + + if token.Type == TokenEOF { + if len(currentWords) > 0 { + match := bleve.NewMatchQuery(strings.Join(currentWords, " ")) + match.SetOperator(bleveQuery.MatchQueryOperatorAnd) + match.SetField(field) + if negated { + curr.AddMustNot(match) + } else { + curr.AddMust(match) + } + } + break + } + + if token.Type == TokenQuote { + if currentExactMatch == nil { + currentExactMatch = &reusableCurrentMatch + } else { + exactMatches = append(exactMatches, currentExactMatch.String()) + currentExactMatch.Reset() + reusableCurrentMatch = *currentExactMatch + currentExactMatch = nil + } + continue + } + + if currentExactMatch != nil { + if currentExactMatch.Len() > 0 { + currentExactMatch.WriteByte(' ') + } + currentExactMatch.WriteString(strings.ToLower(token.Value)) + currentWords = append(currentWords, token.Value) + continue + } + + if token.Type == TokenWord { + currentWords = append(currentWords, token.Value) + continue + } else if len(currentWords) > 0 { + match := bleve.NewMatchQuery(strings.Join(currentWords, " ")) + match.SetOperator(bleveQuery.MatchQueryOperatorAnd) + match.SetField(field) + if negated { + curr.AddMustNot(match) + } else { + curr.AddMust(match) + } + currentWords = currentWords[:0] + negated = false + } + + switch token.Type { + case TokenLParen: + // push current query to parents stack with the last operator + parents = append(parents, curr) + parentOps = append(parentOps, lastOp) + // reset lastOp to default for inner parentheses + lastOp = TokenAND + // start new boolean query for parentheses content + curr = bleve.NewBooleanQuery() + continue + case TokenRParen: + // finalize any remaining words + if len(currentWords) > 0 { + match := bleve.NewMatchQuery(strings.Join(currentWords, " ")) + match.SetOperator(bleveQuery.MatchQueryOperatorAnd) + match.SetField(field) + if negated { + curr.AddMustNot(match) + } else { + curr.AddMust(match) + } + currentWords = currentWords[:0] + negated = false + } + + // pop parent and merge with current + if len(parents) > 0 { + parent := parents[len(parents)-1] + op := parentOps[len(parentOps)-1] + + // create a new boolean query to combine parent and current + var combined bleveQuery.Query + switch op { + case TokenOR: + or := bleve.NewDisjunctionQuery() + or.AddQuery(parent) + or.AddQuery(curr) + combined = or + case TokenAND: + and := bleve.NewConjunctionQuery() + and.AddQuery(parent) + and.AddQuery(curr) + combined = and + } + + curr = bleve.NewBooleanQuery() + curr.AddMust(combined) + parents = parents[:len(parents)-1] + parentOps = parentOps[:len(parentOps)-1] + } + continue + } + + next := p.lexer.NextToken() + following := p.lexer.PeekToken() + if next.Type == TokenNOT { + negated = true + } + + switch token.Type { + case TokenOR: + if next.Type != TokenLParen && !(next.Type == TokenNOT && following.Type == TokenLParen) { + // if this is not followed by a "(" or "NOT (" consider the follow next word as the only parameter + other := bleve.NewMatchQuery(next.Value) + other.SetOperator(bleveQuery.MatchQueryOperatorAnd) + other.SetField(field) + or := bleve.NewDisjunctionQuery() + or.AddQuery(curr) + or.AddQuery(other) + curr = bleve.NewBooleanQuery() + curr.AddMust(or) + } else { + lastOp = TokenOR + } + case TokenAND: + if next.Type != TokenLParen && !(next.Type == TokenNOT && following.Type == TokenLParen) { + // if this is not followed by a "(" consider the follow next word as the only parameter + other := bleve.NewMatchQuery(next.Value) + other.SetOperator(bleveQuery.MatchQueryOperatorAnd) + other.SetField(field) + and := bleve.NewConjunctionQuery() + and.AddQuery(curr) + and.AddQuery(other) + curr = bleve.NewBooleanQuery() + curr.AddMust(and) + } else { + lastOp = TokenAND + } + case TokenNOT: + if next.Type != TokenLParen { + // if this is not followed by a "(" or "NOT (" consider the follow next word as the only parameter + other := bleve.NewMatchQuery(next.Value) + other.SetOperator(bleveQuery.MatchQueryOperatorAnd) + other.SetField(field) + curr.AddMustNot(other) + } else { + negated = true + } + default: + p.lexer.ReturnToken(next) + } + } + + return curr, exactMatches, nil +} diff --git a/eventstore/bleve/query_parser_test.go b/eventstore/bleve/query_parser_test.go new file mode 100644 index 0000000..106bb5c --- /dev/null +++ b/eventstore/bleve/query_parser_test.go @@ -0,0 +1,57 @@ +package bleve + +import ( + "testing" + + "github.com/blevesearch/bleve/v2" + "github.com/stretchr/testify/require" +) + +func TestParseQuery(t *testing.T) { + mapping := bleve.NewIndexMapping() + mapping.DefaultAnalyzer = "en" + index, err := bleve.NewMemOnly(mapping) + require.NoError(t, err) + + docs := []map[string]interface{}{ + {"id": "1", "phrase": "I like fruit especially banana and strawberry"}, + {"id": "2", "phrase": "I like fruit like apples and oranges"}, + {"id": "3", "phrase": "I like vegetables but not fruit"}, + {"id": "4", "phrase": "Banana bread is delicious"}, + {"id": "5", "phrase": "Strawberry jam and banana smoothie"}, + } + + for _, doc := range docs { + err := index.Index(doc["id"].(string), doc) + require.NoError(t, err) + } + + testQueries := []struct { + query string + expected int + exactMatches []string + }{ + {"fruit", 3, nil}, + {"banana (NOT delicious)", 2, nil}, + {"banana (NOT delicious) bread", 0, nil}, + {"smoothie OR apples", 2, nil}, + {"smoothie OR apples (NOT fruit)", 1, nil}, + {"\"I like\"", 3, []string{"i like"}}, + {"banana \"I like fruit\" strawberries", 1, []string{"i like fruit"}}, + {"\"I like fruit\" (strawberry OR apple)", 2, []string{"i like fruit"}}, + } + + for _, test := range testQueries { + query, exactMatches, err := parse(test.query, "phrase") + require.NoError(t, err) + + require.Equal(t, test.exactMatches, exactMatches) + + search := bleve.NewSearchRequest(query) + results, err := index.Search(search) + require.NoError(t, err) + + require.Equal(t, test.expected, int(results.Total), + "query '%s' expected %d results, got %d", test.query, test.expected, results.Total) + } +} diff --git a/eventstore/bleve/replace.go b/eventstore/bleve/replace.go deleted file mode 100644 index 21163b7..0000000 --- a/eventstore/bleve/replace.go +++ /dev/null @@ -1,37 +0,0 @@ -package bleve - -import ( - "fmt" - - "fiatjaf.com/nostr" - "fiatjaf.com/nostr/eventstore" -) - -func (b *BleveBackend) ReplaceEvent(evt nostr.Event) error { - b.Lock() - defer b.Unlock() - - filter := nostr.Filter{Kinds: []nostr.Kind{evt.Kind}, Authors: []nostr.PubKey{evt.PubKey}} - if evt.Kind.IsAddressable() { - filter.Tags = nostr.TagMap{"d": []string{evt.Tags.GetD()}} - } - - shouldStore := true - for previous := range b.QueryEvents(filter, 1) { - if nostr.IsOlder(previous, evt) { - if err := b.DeleteEvent(previous.ID); err != nil { - return fmt.Errorf("failed to delete event for replacing: %w", err) - } - } else { - shouldStore = false - } - } - - if shouldStore { - if err := b.SaveEvent(evt); err != nil && err != eventstore.ErrDupEvent { - return fmt.Errorf("failed to save: %w", err) - } - } - - return nil -} diff --git a/eventstore/bleve/save.go b/eventstore/bleve/save.go deleted file mode 100644 index edb7c33..0000000 --- a/eventstore/bleve/save.go +++ /dev/null @@ -1,23 +0,0 @@ -package bleve - -import ( - "fmt" - "strconv" - - "fiatjaf.com/nostr" -) - -func (b *BleveBackend) SaveEvent(evt nostr.Event) error { - doc := map[string]interface{}{ - contentField: evt.Content, - kindField: strconv.Itoa(int(evt.Kind)), - pubkeyField: evt.PubKey.Hex()[56:], - createdAtField: float64(evt.CreatedAt), - } - - if err := b.index.Index(evt.ID.Hex(), doc); err != nil { - return fmt.Errorf("failed to index '%s' document: %w", evt.ID, err) - } - - return nil -} diff --git a/eventstore/boltdb/lib.go b/eventstore/boltdb/lib.go index 4411aa8..7ba3a6d 100644 --- a/eventstore/boltdb/lib.go +++ b/eventstore/boltdb/lib.go @@ -28,6 +28,8 @@ type BoltBackend struct { MapSize int64 DB *bbolt.DB + ReadOnly bool + EnableHLLCacheFor func(kind nostr.Kind) (useCache bool, skipSavingActualEvent bool) } @@ -36,6 +38,7 @@ func (b *BoltBackend) Init() error { Timeout: 2 * time.Second, PreLoadFreelist: true, FreelistType: bbolt.FreelistMapType, + ReadOnly: b.ReadOnly, }) if err != nil { return err diff --git a/go.mod b/go.mod index 4baecdc..740aa19 100644 --- a/go.mod +++ b/go.mod @@ -43,6 +43,7 @@ require ( fiatjaf.com/lib v0.3.6 github.com/dgraph-io/ristretto/v2 v2.3.0 github.com/go-git/go-git/v5 v5.16.3 + github.com/pemistahl/lingua-go v1.4.0 github.com/sivukhin/godjot v1.0.6 github.com/templexxx/cpu v0.0.1 github.com/templexxx/xhex v0.0.0-20200614015412-aed53437177b @@ -64,6 +65,7 @@ require ( github.com/blevesearch/scorch_segment_api/v2 v2.2.16 // indirect github.com/blevesearch/segment v0.9.1 // indirect github.com/blevesearch/snowballstem v0.9.0 // indirect + github.com/blevesearch/stempel v0.2.0 // indirect github.com/blevesearch/upsidedown_store_api v1.0.2 // indirect github.com/blevesearch/vellum v1.0.11 // indirect github.com/blevesearch/zapx/v11 v11.3.10 // indirect @@ -94,6 +96,7 @@ require ( github.com/pmezard/go-difflib v1.0.0 // indirect github.com/rogpeppe/go-internal v1.14.1 // indirect github.com/savsgio/gotils v0.0.0-20240704082632-aef3928b8a38 // indirect + github.com/shopspring/decimal v1.3.1 // indirect github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.1 // indirect github.com/valyala/bytebufferpool v1.0.0 // indirect diff --git a/go.sum b/go.sum index 788c24b..55f8e78 100644 --- a/go.sum +++ b/go.sum @@ -42,6 +42,8 @@ github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+j github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw= github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s= github.com/blevesearch/snowballstem v0.9.0/go.mod h1:PivSj3JMc8WuaFkTSRDW2SlrulNWPl4ABg1tC/hlgLs= +github.com/blevesearch/stempel v0.2.0 h1:CYzVPaScODMvgE9o+kf6D4RJ/VRomyi9uHF+PtB+Afc= +github.com/blevesearch/stempel v0.2.0/go.mod h1:wjeTHqQv+nQdbPuJ/YcvOjTInA2EIc6Ks1FoSUzSLvc= github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMGZzVrdmaozG2MfoB+A= github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ= github.com/blevesearch/vellum v1.0.11 h1:SJI97toEFTtA9WsDZxkyGTaBWFdWl1n2LEDCXLCq/AU= @@ -192,6 +194,8 @@ github.com/onsi/gomega v1.4.1/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5 github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= +github.com/pemistahl/lingua-go v1.4.0 h1:ifYhthrlW7iO4icdubwlduYnmwU37V1sbNrwhKBR4rM= +github.com/pemistahl/lingua-go v1.4.0/go.mod h1:ECuM1Hp/3hvyh7k8aWSqNCPlTxLemFZsRjocUf3KgME= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= @@ -209,6 +213,8 @@ github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8= github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss= github.com/savsgio/gotils v0.0.0-20240704082632-aef3928b8a38 h1:D0vL7YNisV2yqE55+q0lFuGse6U8lxlg7fYTctlT5Gc= github.com/savsgio/gotils v0.0.0-20240704082632-aef3928b8a38/go.mod h1:sM7Mt7uEoCeFSCBM+qBrqvEo+/9vdmj19wzp3yzUhmg= +github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8= +github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= github.com/sivukhin/godjot v1.0.6 h1:yoRD+hlcDbSxP9Gd/KRVlEFXgtGyZyt0CHwhY6Gk3EQ= github.com/sivukhin/godjot v1.0.6/go.mod h1:wA6KdR4Z+XpwdwyViPDLWYYxT72pKjNc6XGA9I025gM= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=