From 75381b653edf26c9bee3543a57627b9affcd34f2 Mon Sep 17 00:00:00 2001 From: Jon Staab Date: Wed, 20 May 2026 16:07:58 -0700 Subject: [PATCH] Add search chapter --- book/11-filters.md | 63 ++++++-- book/12-search.md | 214 +++++++++++++++++++++++++ book/plan/search.md | 189 ++++++++++++++++++++++ book/research/search.md | 307 ++++++++++++++++++++++++++++++++++++ coracle-lib/tests/search.rs | 250 +++++++++++++++++++++++++++++ 5 files changed, 1014 insertions(+), 9 deletions(-) create mode 100644 book/12-search.md create mode 100644 book/plan/search.md create mode 100644 book/research/search.md create mode 100644 coracle-lib/tests/search.rs diff --git a/book/11-filters.md b/book/11-filters.md index 789a2c7..66e9d07 100644 --- a/book/11-filters.md +++ b/book/11-filters.md @@ -83,6 +83,10 @@ pub struct Filter { /// Maximum number of events a consumer should return. This is not a /// matching criterion — [`matches`](Filter::matches) ignores it. pub limit: Option, + /// NIP-50 full-text search query. The string is opaque at this point: + /// its structure and local relevance scoring are the subject of the next + /// chapter. `None` means no search constraint. + pub search: Option, } ``` @@ -91,6 +95,10 @@ the struct. But it is a result-count constraint for consumers (relays, storage engines), not a predicate over individual events. The `matches` method ignores it entirely. +The `search` field comes from NIP-50. At this point it is just a string +carried alongside the other fields; the [Search](12-search.md) chapter +gives it meaning — a query model and a local relevance score. 
+ ## Matching Matching walks each present field and returns `false` as soon as one @@ -190,6 +198,7 @@ impl Filter { since: None, until: None, limit: None, + search: None, } } @@ -446,6 +455,9 @@ impl Serialize for Filter { if self.limit.is_some() { count += 1; } + if self.search.is_some() { + count += 1; + } let mut map = serializer.serialize_map(Some(count))?; @@ -477,6 +489,9 @@ impl Serialize for Filter { if let Some(limit) = self.limit { map.serialize_entry("limit", &limit)?; } + if let Some(search) = &self.search { + map.serialize_entry("search", search)?; + } map.end() } @@ -511,6 +526,7 @@ impl<'de> Visitor<'de> for FilterVisitor { let mut since: Option = None; let mut until: Option = None; let mut limit: Option = None; + let mut search: Option = None; while let Some(key) = map.next_key::()? { match key.as_str() { @@ -544,6 +560,7 @@ impl<'de> Visitor<'de> for FilterVisitor { "since" => since = Some(map.next_value()?), "until" => until = Some(map.next_value()?), "limit" => limit = Some(map.next_value()?), + "search" => search = Some(map.next_value()?), other if other.starts_with('#') => { let tag_name = other[1..].to_string(); let values: Vec = map.next_value()?; @@ -563,6 +580,7 @@ impl<'de> Visitor<'de> for FilterVisitor { since, until, limit, + search, }) } } @@ -634,6 +652,7 @@ impl Filter { /// (structural shape) /// - The exact `since` and `until` values (different time windows /// cannot be combined) + /// - The exact `search` query (different searches cannot be combined) /// /// A filter with a `limit` always gets a unique group key, because /// merging limited filters would change result-count semantics. 
@@ -649,6 +668,7 @@ impl Filter { self.since.hash(&mut hasher); self.until.hash(&mut hasher); + self.search.hash(&mut hasher); if self.limit.is_some() { // Each limited filter gets a unique group — merging two @@ -669,7 +689,9 @@ and a filter on `#p` tags land in different groups — as they should, since merging them by union would change the semantics. Likewise, two filters with different `since` or `until` values land in different groups, because a union of their sets under one time window would either -over-fetch or under-fetch relative to what was requested. +over-fetch or under-fetch relative to what was requested. The `search` +query is treated the same way: two filters with different searches can +never be merged, so each distinct search forms its own group. ## Union and intersection @@ -741,8 +763,9 @@ the earliest `until`. Finally it passes the result through /// /// Set fields are unioned. Time windows are tightened: the latest /// `since` and earliest `until` win. If both filters have a `limit`, -/// the larger one is kept. The result is simplified with -/// [`union_filters`]. +/// the larger one is kept. Two filters carrying *different* searches +/// cannot be combined into one, so the pair is kept separate instead. +/// The result is simplified with [`union_filters`]. pub fn intersect_filters(groups: &[Vec]) -> Vec { let Some(first) = groups.first() else { return vec![]; @@ -755,7 +778,15 @@ pub fn intersect_filters(groups: &[Vec]) -> Vec { for f1 in &result { for f2 in filters { - combined.push(combine_pair(f1, f2)); + match combine_pair(f1, f2) { + Some(f) => combined.push(f), + // Two different searches can't be combined into one + // filter; keep both so neither query is lost. 
+ None => { + combined.push(f1.clone()); + combined.push(f2.clone()); + } + } } } @@ -765,7 +796,16 @@ pub fn intersect_filters(groups: &[Vec<Filter>]) -> Vec<Filter> { union_filters(&result) } -fn combine_pair(a: &Filter, b: &Filter) -> Filter { +fn combine_pair(a: &Filter, b: &Filter) -> Option<Filter> { + // Two different searches cannot be expressed as a single search, so + // there is no filter that satisfies both. Returning `None` tells the + // caller to keep the pair separate rather than fabricate one. + if let (Some(s1), Some(s2)) = (&a.search, &b.search) { + if s1 != s2 { + return None; + } + } + let mut f = Filter::new(); f.ids = union_option_sets(&a.ids, &b.ids); @@ -794,7 +834,10 @@ fn combine_pair(a: &Filter, b: &Filter) -> Filter { (l, None) | (None, l) => l, }; - f + // At most one search is present here (equal searches collapse to one). + f.search = a.search.clone().or_else(|| b.search.clone()); + + Some(f) } fn union_option_sets( @@ -815,6 +858,8 @@ fn union_option_sets( ## What's next -The next chapter extends filters with NIP-50 full-text search — an -optional `search` field that some relays support for content-based -queries. +The `search` field rides along through serialization, grouping, and the +set algebra here, but it has no meaning yet — `matches` doesn't look at +it, and the string is uninterpreted. The next chapter takes up NIP-50 +full-text search: a query model that splits the string into terms, and a +best-effort local relevance score for ranking events already in hand. diff --git a/book/12-search.md b/book/12-search.md new file mode 100644 index 0000000..6a2a297 --- /dev/null +++ b/book/12-search.md @@ -0,0 +1,214 @@ +# Search + +NIP-50 adds one field to the filter from the previous chapter: a `search` +string. A relay that advertises the capability reads the string as a +human-readable query — `best nostr apps` — matches it against event content, +and returns results ordered by relevance rather than by `created_at`, with +`limit` applied after ranking. 
+ +Search is opt-in and implementation-defined. Relays decide whether they index events +at all, what matches, and how ranking works. The query may also carry +`key:value` extensions — `domain:`, `language:`, `sentiment:`, `nsfw:`, +`include:spam` — and a relay honors only the ones it understands, ignoring the +rest. There is no global index and no guarantee of completeness: a client +queries the relays it believes support search and accepts a partial view. + +Search may be implemented relay-side, or it may be performed on a client in some +situations. This chapter provides utilities for parsing search terms along with +a very basic model for implementing search that is decoupled from filter matching +itself and entirely opt-in. + +## The module + +```rust {file=coracle-lib/src/lib.rs} +pub mod search; +``` + +```rust {file=coracle-lib/src/search.rs} +//! NIP-50 full-text search queries. +//! +//! A [`SearchQuery`] holds the terms of a search string and computes a +//! best-effort relevance score against event content — for the case where +//! search runs on the client, over events already in hand, rather than on a +//! relay. + +use std::fmt; +``` + +## The query model + +A `SearchQuery` is just the query's terms: the words split out of the search +string. NIP-50 also defines `key:value` extensions, but their meaning is +relay-defined, and the local scorer has no way to evaluate `sentiment:negative` +or `domain:example.com` without data it doesn't have. Rather than model +extensions we can't honor, we treat every token as a term. A relay that +understands an extension still sees it verbatim in the query string; the local +scorer simply matches it as text like any other word. + +```rust {file=coracle-lib/src/search.rs} +/// A parsed NIP-50 search query: the terms of the query string. +/// +/// NIP-50 `key:value` extensions are not modeled separately — their semantics +/// are relay-defined and cannot be evaluated locally, so each is kept as an +/// ordinary term. 
+#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct SearchQuery { + /// The query's terms, in order. + pub terms: Vec<String>, +} +``` + +### Parsing + +Parsing splits the query on whitespace. Every token becomes a term, including +anything that looks like an extension. There is nothing to reject, so parsing is +total — it never errors. + +```rust {file=coracle-lib/src/search.rs} +impl SearchQuery { + /// Create an empty query. + pub fn new() -> Self { + SearchQuery::default() + } + + /// Parse a raw query string by splitting it on whitespace. Every token, + /// extension-like or not, becomes a term. Parsing never fails. + pub fn parse(input: &str) -> Self { + SearchQuery { + terms: input.split_whitespace().map(str::to_string).collect(), + } + } + + /// True when the query has no terms. + pub fn is_empty(&self) -> bool { + self.terms.is_empty() + } +} +``` + +Rendering joins the terms back into a query string. It is the inverse of +parsing: feeding the output of one into the other gives an equal query, modulo +runs of whitespace collapsing to single spaces. + +```rust {file=coracle-lib/src/search.rs} +impl fmt::Display for SearchQuery { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.terms.join(" ")) + } +} +``` + +## Scoring + +NIP-50 returns results in descending order of relevance, so a boolean "matches +or not" is the wrong shape for a local implementation. The scorer instead +returns a number in `0.0..=1.0`, which can drive both inclusion (anything above +zero is a hit) and ordering. + +The score has two parts. The base is the fraction of the query's terms that +appear in the content, compared case-insensitively — three terms, two present, +gives `2/3`. On top of that, repeated occurrences add a small, diminishing +bonus, so that among events matching the same set of terms the ones that mention +them more often rank higher. 
The bonus is strictly less than `1/total`, which means +it can reorder events that share the same base fraction but can never push a partial match up +to a full one: a missing term always costs more than any number of repetitions +can recover. Because the total is capped at `1.0`, events that contain every term all score exactly `1.0`; the bonus only separates partial matches. An empty query — no terms — scores `1.0`, since there is no text to +constrain. + +```rust {file=coracle-lib/src/search.rs} +impl SearchQuery { + /// Score `content` against this query's terms, in `0.0..=1.0`. + /// + /// The base score is the fraction of the query's terms found in the content + /// (case-insensitive substring). Repeated occurrences add a diminishing + /// bonus, strictly less than one term's worth, so a partial match never + /// reaches `1.0`. An empty query scores `1.0`: there is no text to match. + pub fn score(&self, content: &str) -> f64 { + let total = self.terms.len(); + if total == 0 { + return 1.0; + } + + let haystack = content.to_lowercase(); + + let mut matched = 0usize; + let mut extra = 0usize; + for term in &self.terms { + let needle = term.to_lowercase(); + if needle.is_empty() { + // An empty term imposes no constraint; treat it as present. + matched += 1; + continue; + } + let count = haystack.matches(needle.as_str()).count(); + if count > 0 { + matched += 1; + extra += count - 1; + } + } + + let base = matched as f64 / total as f64; + let bonus = (1.0 - 1.0 / (1.0 + extra as f64)) / total as f64; + (base + bonus).min(1.0) + } +} +``` + +Lowercasing uses `to_lowercase`, which folds case across Unicode rather than +only ASCII. That allocates, but nostr content is multilingual, and correctness +on non-Latin text is worth more than avoiding a copy in a best-effort matcher. + +## Connecting queries to filters + +The previous chapter gave `Filter` a `search` field but no way to set it. The +setters follow the established `add_*` / `clear_*` vocabulary. + +```rust {file=coracle-lib/src/filters.rs} +use crate::search::SearchQuery; + +impl Filter { + /// Set the NIP-50 search query. 
+ pub fn add_search(mut self, search: impl Into<String>) -> Self { + self.search = Some(search.into()); + self + } + + /// Remove the search query, leaving no search constraint. + pub fn clear_search(mut self) -> Self { + self.search = None; + self + } +} +``` + +Scoring an event against a filter is then a matter of parsing the field and +delegating to `SearchQuery::score`. With no search set the method returns `1.0`, +so an unsearched filter never penalizes an event. This is purely the search +dimension — it is independent of the structural `matches` check from the +previous chapter, and the two are meant to be composed by the caller, not folded +together. A consumer that wants search-ranked results filters with `matches`, +scores with `search_score`, and sorts as it sees fit. + +```rust {file=coracle-lib/src/filters.rs} +impl Filter { + /// Best-effort local relevance score for `event`, in `0.0..=1.0`. + /// + /// Parses the `search` field and scores it against the event's content, + /// returning `1.0` when there is no search. This considers *only* the + /// `search` field; it is independent of [`matches`](Filter::matches). + pub fn search_score(&self, event: &Event) -> f64 { + match &self.search { + Some(query) => SearchQuery::parse(query).score(&event.content), + None => 1.0, + } + } +} +``` + +## What's next + +Search depends on routing the query to a relay that actually supports it. +Discovering which relays advertise NIP-50, and choosing among them, is a +networking and relay-metadata concern — the subject of the Domain and Networking +sections, where relay selection is built on top of the filter types assembled +here. diff --git a/book/plan/search.md b/book/plan/search.md new file mode 100644 index 0000000..5c201ec --- /dev/null +++ b/book/plan/search.md @@ -0,0 +1,189 @@ +# Plan: Search + +## Topic Summary + +NIP-50 adds an optional full-text `search` field to the subscription filter from +chapter 11. 
A relay that supports the capability interprets the query against +event content (and, for some kinds, other fields), returning results ordered by +relevance rather than `created_at`, with `limit` applied after ranking. The +query may carry `key:value` extensions — `domain:`, `language:`, `sentiment:`, +`nsfw:`, `include:spam` — which relays may support or ignore. + +This chapter extends `Filter` with a `search` field, threads it through +serialization / grouping / set algebra, introduces a typed `SearchQuery` that +splits free-text terms from `key:value` extensions, and implements a best-effort +local relevance **score in [0, 1]** used to both include and rank events — +mirroring the NIP's "descending order by quality of result, limit last." + +## Chapter Outline + +1. **Intro / framing** — Search as a relay-defined, optional capability; content + discovery is client-initiated routing, not a global index; results are + partial and ranked by the relay. The local matcher is an honest best-effort + fallback, not a reimplementation of relay search. +2. **The `search` field** — Add `search: Option` to `Filter`; builder + methods `add_search` / `clear_search`; note it joins the derived `Hash` (so + `id()` covers it for free). +3. **Serialization** — Emit/parse a plain `"search"` key in the hand-written + serde impl, present only when `Some`. +4. **The `SearchQuery` model** — A new `search` module: terms + ordered + `key:value` extensions, `parse`, `Display`, builders, and the `Filter` bridge. +5. **Scoring & matching** — `search_score` (fraction-of-terms + diminishing + frequency bonus, capped at 1.0); `matches` includes an event when score > 0; + `rank_search_results` sorts by score then `created_at` and applies `limit`. +6. **Grouping and set algebra** — `search` enters `group()` (distinct searches + never merge); `union_filters` carries it through unchanged; `intersect_filters` + keeps a conflicting-search pair separate instead of fabricating a combined query. +7. 
**What's next** — Brief pointer to the Domain section (relay selection, + discovering NIP-50-capable relays via relay metadata, is a later concern). + +## API Design + +### `coracle-lib/src/filters.rs` (extends existing `Filter`) + +```rust +pub struct Filter { + // ... existing fields ... + /// NIP-50 full-text search query. Relay-interpreted; see `SearchQuery`. + pub search: Option, +} + +impl Filter { + pub fn add_search(self, search: impl Into) -> Self; // sets Some + pub fn clear_search(self) -> Self; // sets None + + /// Bridge to the typed model. + pub fn add_search_query(self, query: &SearchQuery) -> Self; // = add_search(query.to_string()) + pub fn search_query(&self) -> Option; // parse the field back + + /// Best-effort local relevance score in [0.0, 1.0]. + /// Returns 1.0 when there is no search, or a search with no free-text + /// terms (only extensions, which are unenforceable locally). + pub fn search_score(&self, event: &Event) -> f64; +} + +/// Filter `events` to those matching `filter`, sort by relevance +/// (search_score desc, then created_at desc), and apply `filter.limit`. +pub fn rank_search_results<'a>(filter: &Filter, events: &'a [Event]) -> Vec<&'a Event>; +``` + +`matches` gains a final check: `if self.search_score(event) == 0.0 { return false }`. +Because `search_score` returns 1.0 when there is no search (or no terms), this +only rejects when a search *with terms* matched none of them — i.e. "any term +present ⇒ included." + +### `coracle-lib/src/search.rs` (new module) + +```rust +/// A parsed NIP-50 search query: free-text terms plus `key:value` extensions. +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct SearchQuery { + pub terms: Vec, + pub extensions: Vec<(String, String)>, // ordered; repeats allowed +} + +impl SearchQuery { + pub fn new() -> Self; + /// Total parse: split on whitespace; a token is an extension iff it is + /// `key:value` with key in [A-Za-z0-9_-]+, non-empty value not starting + /// with '/'. 
Everything else is a term. Never fails. + pub fn parse(input: &str) -> Self; + pub fn add_term(self, term: impl Into) -> Self; + pub fn add_extension(self, key: impl Into, value: impl Into) -> Self; + pub fn is_empty(&self) -> bool; +} + +impl fmt::Display for SearchQuery { /* terms first, then "key:value" exts, space-joined */ } +``` + +`Filter::matches` / `search_score` tokenize via `SearchQuery::parse`, using only +`terms` (extensions are ignored by the local matcher). + +### Scoring formula (`search_score`) + +For the parsed query's distinct `terms` (case-insensitive), against +`event.content` lowercased: + +- `total` = number of distinct terms; if 0 → return 1.0. +- For each term, `count` = non-overlapping occurrences in content. +- `matched` = terms with `count ≥ 1`; `extra` = (Σ count) − matched (repeats + beyond the first hit of each matched term). +- `base = matched / total` (fraction of terms present, in [0, 1]). +- `bonus = (1 − 1/(1 + extra)) / total` (diminishing, strictly `< 1/total`, so a + partial match never reaches the next term's bucket). +- `score = (base + bonus).min(1.0)`. + +Properties (asserted in tests): in [0, 1]; all terms once ⇒ 1.0; missing a term +⇒ `< 1.0`; more occurrences ⇒ ≥ score (monotonic, never exceeds 1.0); no terms +matched ⇒ exactly 0.0. + +## Code Organization + +- **`coracle-lib/src/filters.rs`** — add the `search` field, builders, the + serde changes, `search_score`, the `matches` check, `rank_search_results`, + and the `group()` / `intersect_filters` updates. `use crate::search::SearchQuery;`. +- **`coracle-lib/src/search.rs`** — the `SearchQuery` type. New `pub mod search;` + in `lib.rs`, placed before `filters` (filters depends on it). +- **`coracle-lib/src/prelude.rs`** — add `pub use crate::search::SearchQuery;` + (the prelude already re-exports commonly used items). +- **`coracle-lib/tests/search.rs`** — hand-written integration tests (not tangled). + +## Dependencies + +None new. 
Parsing and matching use `std` only. No FTS engine — out of scope and +against the minimal-dependency rule. + +## Narrative Notes + +- Open with the philosophy: search is opt-in and relay-defined; no global index; + results partial and relay-ranked. Frame the local scorer as a fallback for + in-memory/offline querying, and warn (per rust-nostr's SDK) that re-filtering a + relay's returned results client-side can wrongly drop legitimate hits — relays + rank with richer, extension-aware logic. +- Explain *why* extensions are parsed but **ignored locally**: `sentiment:`, + `domain:`, etc. require data the client doesn't have, so honoring them locally + is impossible; we keep them in the typed model for *building/inspecting* + queries, not for local evaluation. +- Justify the score model concretely: NIP-50 mandates relevance ordering, so a + boolean match is the wrong shape — a [0,1] score lets us both include + (score > 0) and rank. Walk through the fraction + diminishing-bonus formula + with a small worked example. +- For grouping: reuse the chapter-11 reasoning — two filters with different + searches can't be unioned without changing semantics, so `search` joins the + group key. Show that `union_filters` then keeps them separate automatically. +- For `intersect_filters`: explain the one structural change — `combine_pair` + returns `Option`; a pair whose two searches differ returns `None`, and + the caller emits both filters separately rather than concatenating queries. + +## Design Decisions + +1. **Typed `SearchQuery`, lean/generic.** Terms + a generic ordered list of + `key:value` extensions, with `add_term`/`add_extension`. No per-extension + helpers or typed enums — keeps the surface small and forward-compatible with + relay-specific extensions. (Every reference treats search as opaque; the typed + model is our value-add.) +2. **Local relevance score in [0, 1]**, fraction-of-terms + diminishing frequency + bonus, capped at 1.0. 
Chosen over a boolean to model NIP-50's relevance + ordering. Extensions excluded from scoring. +3. **`matches` includes on score > 0** ("any term present"); ranking via + `rank_search_results` handles relevance + `limit`-after-sort. +4. **`search` participates in `group()`**, so `union_filters` never merges + distinct searches. +5. **`intersect_filters` keeps a conflicting-search pair separate** (combine + returns `Option`, `None` ⇒ emit both) rather than concatenating, per the + user's choice. +6. **Builder naming `add_search`/`clear_search`** to match the existing + `add_since`/`clear_since` vocabulary (not rust-nostr's `search`/`remove_search`). +7. **Unicode-aware lowercasing** (`to_lowercase`) for the local matcher rather + than ASCII-only, given multilingual nostr content; note the allocation + trade-off. Substring counting via `str::matches`. +8. **Extension parse heuristic** documented: a colon-bearing token like a URL may + be read as an extension; applications needing exact control build + `SearchQuery` field-by-field instead of parsing. + +## Open Questions + +- Exact wording of the frequency-bonus explanation — keep the formula in prose + light; lean on a worked example. (Resolved during writing.) +- Whether `rank_search_results` belongs as a free function (consistent with + `matches_any`/`union_filters`) — yes, free function. diff --git a/book/research/search.md b/book/research/search.md new file mode 100644 index 0000000..3bc58c6 --- /dev/null +++ b/book/research/search.md @@ -0,0 +1,307 @@ +# Research: Search + +## Topic Summary + +NIP-50 adds an optional full-text `search` field to the subscription filter +introduced in chapter 11. A relay that supports the capability interprets the +query string against event content (and, for some kinds, other fields), +returning results ordered by relevance rather than `created_at`. 
The query may +carry structured extensions in the form of `key:value` pairs — `domain:`, +`language:`, `sentiment:`, `nsfw:`, `include:spam` — which relays may support or +ignore. + +The chapter will: + +1. Add a `search` field to the existing `Filter` type, wiring it through + construction, serialization, hashing, grouping, and the union/intersect + utilities. +2. Introduce a typed `SearchQuery` model that splits free-text terms from + `key:value` extensions, so applications can build and inspect queries safely + instead of stringly-typed concatenation. (This is a deliberate departure + from every reference, which treats the query as an opaque string.) +3. Implement a best-effort, case-insensitive local matcher over event content, + while documenting that real ranking and extension semantics are + relay-defined. + +The code lives in `coracle-lib`: the `search` field extends `filters.rs`, and +the query model gets a dedicated `search.rs` module. + +## Philosophy + +From `ref/building-nostr`, the framing relevant to search is that **content +discovery on nostr is client-initiated routing through relay selection**, not a +query against a global index. Searching is "knowing where to send queries." A +relay that supports NIP-50 is exercising an *optional, relay-authored +capability* — like content curation or access control — and defines its own +matching semantics, including which extensions it honors. This mirrors the NIP's +own "relays SHOULD ignore extensions they don't support." + +Three principles bear directly on the chapter's voice: + +- **No guaranteed completeness.** "No implementation will have a complete view + of every heuristic that is applicable" — so search results are neither global + nor exhaustive. A client queries the relays it knows support search and + accepts a partial, spontaneous view. This should be stated honestly, not hidden. 
+- **Indexing is the curator's responsibility, not the user's.** Authors publish + signed events; relays (or indexing services) that *want* content discoverable + maintain the index. Clients do nothing special beyond sending a `search` + filter to a search-capable relay. +- **Publicity, not privacy.** Full-text indexing makes content patterns + discoverable and gives relay operators visibility into queries. The honest + framing: search is a publicity feature. + +The takeaway for our library: model `search` as a first-class but optional +filter field, keep the query structured enough that applications can reason +about it, and be candid that local matching is a best-effort approximation of a +relay-defined operation. + +## Reference Implementation Analysis + +### applesauce + +`search` is an optional string on an extended `Filter` type +(`packages/core/src/helpers/filter.ts`): `Filter = CoreFilter & { search?: string }`, +extending nostr-tools' base type. **Opaque** — no extension parsing. + +Dual-mode: relay subscriptions pass the string through verbatim; a local SQLite +backend (`packages/sqlite`) indexes content into an FTS5 table and runs +`events_search MATCH ?` with the raw string double-quote-escaped. Local +client-side `matchFilter()` **ignores** the search field entirely. Pluggable +"search content formatters" decide what gets indexed (default: `content`; +enhanced: kind-0 profile fields plus `t`/`subject`/`title`/`summary`/`d` tags). +Supports `order: "created_at" | "rank"` for FTS5 ranking. Low coupling; SQLite +is optional. No query-extension awareness anywhere. + +### ndk + +`search?: string` on `NDKFilter` (`core/src/subscription/index.ts:30`). +**Opaque, relay-only.** No parsing, no validation (filter-validation pipeline +skips it), no client-side matching (delegates to nostr-tools' `matchFilters`, +which ignores search). No helper functions for building search filters; callers +construct `{ search: "..." }` by hand. 
The field is serialized and sent to +relays as-is. No NIP-11 capability negotiation or fallback. Minimal by design. + +### nostr-gadgets + +Re-uses `@nostr/tools`' `Filter` type (`search?: string`). **Opaque, +relay-only.** Notably its local stores *reject* search: the in-memory store +returns an empty set if `filter.search` is present, and the RedEventStore docs +state "any filters supported (except 'search')." Provides a hardcoded +`SEARCH_RELAYS` constant (`defaults.ts`): `relay.nostr.band`, `nostr.wine`, +`relay.noswhere.com`, `relay.nos.today`. No query builders, no dynamic relay +capability detection. + +### nostrlib (Go) + +`Search string` on the `Filter` struct (`filter.go`), (de)serialized as a plain +`"search"` JSON key. The core `Filter.Matches` / `MatchesIgnoringTimestampConstraints` +**ignores** search — matching is delegated to eventstore backends. Key-value +backends (BoltDB, LMDB, MMM) return nothing for search queries; only the **Bleve** +backend implements real full-text search: per-document language auto-detection +(lingua-go, 22 languages), per-language analyzers, boolean query syntax +(`AND/OR/NOT`, parens, quoted phrases), NIP-27 reference extraction with 2× boost, +and case-insensitive substring validation of quoted phrases. Kind-0 profiles index +name/display_name/about; reposts unpack inner events. Khatru relay policies +`NoSearchQueries`/`RemoveSearchQueries` let operators disable search. SDK +`SearchUsers()` just sends a `Search` filter to designated user-search relays. No +NIP-50 *extension* parsing (treats `domain:x` as a regular word); a 2-char minimum +query length is enforced by Bleve. + +### nostr-tools + +`search?: string` on the base `Filter` (`filter.ts`). **The canonical +"defined-but-unused" implementation.** `matchFilter()`/`matchFilters()` do not +check search at all; `mergeFilters()` drops it entirely. No parsing, no +validation, no helpers, no tests for the field. 
Strictly a transport-layer +placeholder so applications can send search filters to relays. Minimal-deps +philosophy: search is purely a relay concern. + +### rust-nostr + +The most directly relevant reference (also Rust). In +`crates/nostr/src/filter.rs`: + +```rust +/// A string describing a query in a human-readable form, i.e. "best nostr apps" +/// +#[serde(skip_serializing_if = "Option::is_none")] +#[serde(default)] +pub search: Option, +``` + +Builder API: `search>(self, value: S) -> Self` and +`remove_search(self) -> Self` — symmetric, generic, `#[inline]`. **Opaque** (no +extension parsing). + +Local matching (`search_match`): + +```rust +fn search_match(&self, event: &Event) -> bool { + match &self.search { + Some(query) => event.content.as_bytes() + .windows(query.len()) + .any(|window| window.eq_ignore_ascii_case(query.as_bytes())), + None => true, + } +} +``` + +Case-insensitive **ASCII** substring via sliding window; `None` matches +everything. Gated by a `MatchEventOptions { nip50: bool, .. }` flag (default +true). Notably, the SDK relay sets `.nip50(false)` with the comment "Skip NIP-50 +matches since they may create issues and ban non-malicious relays" — i.e. +client-side re-matching of a relay's search results can wrongly drop valid hits. +DB backends (LMDB, SQLite) extend matching to a fixed set of searchable tags — +`title`, `description`, `subject`, `name` — lowercasing the query once up front; +empty search → no results. A `Features { full_text_search: bool }` flag declares +backend capability. + +Patterns worth emulating: `Into` builder, `skip_serializing_if` for a +clean wire format, an explicit opt-out for search matching, ASCII case folding +for speed. + +### welshman + +The TypeScript toolkit our library descends from. `search?: string` on `Filter` +(`packages/util/src/Filters.ts`). 
It is the **only reference that matches search +locally and threads it through filter utilities**: + +```typescript +export const matchFilter = (filter, event) => { + if (!nostrToolsMatchFilter(filter, event)) return false + if (filter.search) { + const content = event.content.toLowerCase() + const terms = filter.search.toLowerCase().split(/\s+/g) + for (const term of terms) { + if (content.includes(term)) return true + return false // <-- bug: returns after first term + } + } + return true +} +``` + +The intent is term-splitting + case-insensitive substring, but the early +`return false` means only the first term is ever checked. **A correct version +should decide AND vs OR across terms explicitly** — this is the one place we can +clearly improve on the reference. + +Filter utilities (directly parallel to our `group`/`union_filters`/`intersect_filters`): + +- `calculateFilterGroup` pushes `search:${search}` into the group key — **a + filter with a search is only mergeable with an identical search.** +- `unionFilters` treats `search` (like `since`/`until`/`limit`) as a scalar + preserved from the first filter in the group, **not merged**. +- `intersectFilters` concatenates differing searches with a space + (`[a, b].join(" ")`) — modeling "must match both" as a compound query — and + takes whichever is present otherwise. +- `getFilterId` includes search in the deterministic hash, so different searches + never dedupe. + +Search-relay selection lives in the router: `getSearchRelays()` returns relays +whose NIP-11 `supported_nips` includes `"50"`. No extension parsing. + +## Common Patterns + +- **`search` is universally an optional plain string.** Every reference models + it as `Option<String>` / `search?: string`. None parse the `key:value` + extensions — they treat the whole query as opaque and let the relay interpret + it. Our typed `SearchQuery` is therefore a value-add, not a port.
+- **Local matching is the exception, not the rule.** nostr-tools, ndk, + applesauce (in `matchFilter`), and nostrlib's core `Filter` all *ignore* + search locally; matching happens relay-side (or in a dedicated index like + Bleve/FTS5). Only rust-nostr and welshman attempt local matching, both with + case-insensitive substring over `content`. +- **Where matching exists, it's case-insensitive substring** — rust-nostr does + ASCII-only `eq_ignore_ascii_case` over byte windows (whole query as one + needle); welshman lowercases and splits on whitespace into terms (intending + multi-term, buggily). DB backends additionally search a small fixed set of + metadata tags (`title`, `description`, `subject`, `name`). +- **Search makes filters un-mergeable.** Both welshman (group key) and the + general intuition agree: two filters with different search strings can't be + unioned without changing semantics. rust-nostr sidesteps merging at this layer + entirely. +- **Client-side re-matching is risky.** rust-nostr's SDK disables NIP-50 + matching when filtering relay results, because a relay's notion of a match + (ranked, fuzzy, multi-field, extension-aware) is richer than a client's + substring check — re-filtering can drop legitimate hits. +- **Relay selection by NIP-11.** Search-capable relays are discovered via + `supported_nips` containing `50` (welshman) or a hardcoded allowlist + (nostr-gadgets). This is an application/networking concern, out of scope for + `coracle-lib`. + +## Considerations for Our Implementation + +**Filter field.** Add `pub search: Option<String>` to `Filter`. Follow +rust-nostr: `add_search<S: Into<String>>(self, S)` and `clear_search(self)` to +match the existing `add_*`/`clear_*` builder vocabulary (our methods are named +`add_since`/`clear_since`, etc., so `add_search`/`clear_search` fits better than +rust-nostr's `search`/`remove_search`).
The field already participates in the +derived `Hash` (so `id()` covers it for free), but serialization, `group()`, +`union_filters`, `intersect_filters`, and `matches()` all need explicit updates. + +**Serialization.** Our `Filter` has hand-written serde (to flatten `#tag` keys). +Add `search` as a plain `"search"` key — emit only when `Some` (mirroring +`since`/`until`/`limit`), and read it in the visitor's match arm. A round-trip +test must cover it. + +**Grouping / union / intersect.** Per welshman: include `search` in the +`group()` hash so filters with different searches land in different groups (never +merged). In `union_filters`, since group members share an identical search by +construction, the search carries over via the `or_insert_with(|| filter.clone())` +seed — no special merge needed, but worth a comment. In `combine_pair` +(intersect), decide how to combine two searches: welshman concatenates with a +space. Concatenation is defensible ("must match both") but lossy and surprising; +a cleaner rule for a typed model is to **merge two `SearchQuery` values** (union +their terms and extensions) or, if we keep the field as a string at this layer, +to concatenate with a space and document it. Recommend: concatenate with a space +when both present and differ, matching welshman, and note the limitation. + +**Local matching.** Extend `Filter::matches` to test `search` *after* the cheap +scalar checks. Best-effort, case-insensitive. Two design choices to settle in +planning: +1. Whole-query substring (rust-nostr) vs. term-split AND/OR (welshman, fixed). + A typed `SearchQuery` makes term-split natural: match the free-text terms + (AND across terms reads as the intuitive "all words present"; document it), + and treat `key:value` extensions as *unenforceable locally* — i.e. ignored by + the local matcher, since we can't evaluate `sentiment:` or `domain:` without + external data. This honesty matches the NIP. +2. ASCII (`eq_ignore_ascii_case`) vs. 
Unicode lowercasing. ASCII is what + rust-nostr ships and is allocation-free; Unicode `to_lowercase` is more + correct for non-Latin content but allocates. Given nostr's multilingual + content, prefer Unicode `to_lowercase` for the local matcher — correctness + over micro-optimization, consistent with our "clarity over cleverness" rule — + and note the trade-off. + + Also document, per rust-nostr's SDK, that local matching is a *fallback*: + relay results should generally be trusted as-is rather than re-filtered. + +**`SearchQuery` model (new `search.rs`).** A struct splitting a query into +free-text `terms: Vec<String>` and `extensions: Vec<(String, String)>` (ordered; +NIP-50 doesn't forbid repeats, and order can matter to relays). Parsing: split on +whitespace, treat a token containing `:` (with a non-empty key before it) as an +extension, everything else as a term. Provide: +- `SearchQuery::parse(&str) -> SearchQuery` (total, never fails — unknown shapes + fall back to terms). +- `Display` / `to_string()` that re-renders to the wire string (terms first or + preserve order; planning to decide). +- Builder helpers: `term`, `extension`, plus typed convenience for the + spec-defined extensions (`domain`, `language`, `sentiment`, `nsfw`, + `include_spam`) — optional, decide scope in planning. +- A bridge to `Filter`: `Filter::add_search` can accept `impl Into<String>` so + both a raw string and `query.to_string()` work; optionally + `Filter::search_query()` to parse the field back out. + +Keep `sentiment`/`nsfw` values as strings (or small enums) — leaning toward +strings to stay forward-compatible with relay-specific values, with named +constructors for the common cases. + +**Dependencies.** None new. Parsing is plain string handling; matching uses std. +Avoid pulling in a real FTS engine — out of scope and against the +minimal-dependency rule.
+
+**Out of scope (defer / mention only).** Real relevance ranking; relay-side
+indexing; NIP-11 search-relay discovery (a networking concern); the `order`
+hint from applesauce; multi-field/tag matching beyond `content` (could mention
+`title`/`subject` as a possible extension but keep the matcher content-only for
+clarity).
diff --git a/coracle-lib/tests/search.rs b/coracle-lib/tests/search.rs
new file mode 100644
index 0000000..b7cd4d9
--- /dev/null
+++ b/coracle-lib/tests/search.rs
@@ -0,0 +1,250 @@
+use coracle_lib::events::{Event, EventContent};
+use coracle_lib::filters::{intersect_filters, union_filters, Filter};
+use coracle_lib::keys::SecretKey;
+use coracle_lib::search::SearchQuery;
+// Constant secret key (bytes 1..=32 as hex); `unwrap` panics if `from_hex` rejects the input.
+fn fixed_secret() -> SecretKey {
+    let bytes: [u8; 32] = [
+        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+        25, 26, 27, 28, 29, 30, 31, 32,
+    ];
+    SecretKey::from_hex(&hex::encode(bytes)).unwrap()
+}
+// Signed kind-1 event with the given content and timestamp, owned/signed by `fixed_secret()`.
+fn make_event(content: &str, created_at: u64) -> Event {
+    let sk = fixed_secret();
+    let hashed = EventContent::new()
+        .content(content)
+        .kind(1)
+        .stamp(created_at)
+        .own(sk.public_key())
+        .hash();
+    hashed.clone().sign(sk.sign(&hashed.id)) // clone: `sign` presumably takes `self` by value
+}
+// True when `a` and `b` differ by less than 1e-9; avoids exact `==` on `f64` scores.
+fn approx(a: f64, b: f64) -> bool {
+    (a - b).abs() < 1e-9
+}
+
+// --- SearchQuery parsing ---
+
+#[test]
+fn parse_splits_on_whitespace() {
+    let q = SearchQuery::parse("best nostr apps");
+    assert_eq!(q.terms, vec!["best", "nostr", "apps"]);
+}
+
+#[test]
+fn parse_treats_extensions_as_terms() {
+    // We don't interpret NIP-50 extensions; every token is just a term.
+    let q = SearchQuery::parse("nostr domain:example.com language:en");
+    assert_eq!(q.terms, vec!["nostr", "domain:example.com", "language:en"]);
+}
+
+#[test]
+fn parse_is_empty_for_blank_input() {
+    assert!(SearchQuery::parse("   ").is_empty());
+    assert!(SearchQuery::parse("").is_empty());
+    assert!(SearchQuery::new().is_empty());
+    assert!(!SearchQuery::parse("nostr").is_empty());
+}
+
+#[test]
+fn display_joins_terms() {
+    let q = SearchQuery::parse("nostr best apps");
+    assert_eq!(q.to_string(), "nostr best apps");
+}
+
+#[test]
+fn display_round_trips_through_parse() {
+    let q = SearchQuery::parse("nostr best apps language:en");
+    assert_eq!(SearchQuery::parse(&q.to_string()), q);
+}
+
+// --- Scoring ---
+
+#[test]
+fn score_full_match_is_one() {
+    let q = SearchQuery::parse("nostr apps");
+    assert!(approx(q.score("i love nostr and apps"), 1.0));
+}
+
+#[test]
+fn score_no_match_is_zero() {
+    let q = SearchQuery::parse("nostr");
+    assert!(approx(q.score("no match here"), 0.0));
+}
+
+#[test]
+fn score_partial_match_is_fraction() {
+    let q = SearchQuery::parse("nostr apps");
+    assert!(approx(q.score("only nostr here"), 0.5));
+}
+
+#[test]
+fn score_is_case_insensitive() {
+    let q = SearchQuery::parse("NOSTR");
+    assert!(approx(q.score("the Nostr protocol"), 1.0));
+}
+
+#[test]
+fn score_extension_like_term_matches_as_text() {
+    let q = SearchQuery::parse("language:en");
+    assert!(approx(q.score("posted with language:en today"), 1.0));
+    assert!(approx(q.score("no marker here"), 0.0));
+}
+// An empty query scores 1.0 against any content, including empty content.
+#[test]
+fn score_empty_query_is_one() {
+    assert!(approx(SearchQuery::parse("").score("anything at all"), 1.0));
+    assert!(approx(SearchQuery::new().score(""), 1.0));
+}
+// Repeats of the one matched term lift the score above 0.5 while staying strictly below 1.0.
+#[test]
+fn score_frequency_bonus_orders_partial_matches() {
+    let q = SearchQuery::parse("alpha beta");
+    let once = q.score("alpha only");
+    let many = q.score("alpha alpha alpha");
+    assert!(approx(once, 0.5));
+    assert!(many > once, "repeated term should score higher: {many} vs {once}");
+    assert!(many < 1.0, "a partial match must stay below a full match");
+}
+
+#[test]
+fn score_never_exceeds_one() {
+    let q = SearchQuery::parse("nostr");
+    // Heavy repetition of a full match is still capped at 1.0.
+    assert!(approx(q.score("nostr nostr nostr nostr nostr"), 1.0));
+}
+
+#[test]
+fn score_is_bounded() {
+    let q = SearchQuery::parse("alpha beta gamma");
+    for content in ["", "alpha", "alpha beta", "alpha alpha gamma gamma", "alpha beta gamma"] {
+        let s = q.score(content);
+        assert!((0.0..=1.0).contains(&s), "score {s} out of range for {content:?}");
+    }
+}
+
+// --- Filter integration ---
+
+#[test]
+fn add_and_clear_search() {
+    let f = Filter::new().add_search("nostr");
+    assert_eq!(f.search.as_deref(), Some("nostr"));
+    assert_eq!(f.clear_search().search, None);
+}
+
+#[test]
+fn search_score_without_search_is_one() {
+    let event = make_event("anything", 1);
+    assert!(approx(Filter::new().search_score(&event), 1.0));
+}
+
+#[test]
+fn search_score_with_search() {
+    let event = make_event("the nostr protocol", 1);
+    assert!(approx(Filter::new().add_search("nostr").search_score(&event), 1.0));
+    assert!(approx(Filter::new().add_search("missing").search_score(&event), 0.0));
+}
+
+#[test]
+fn matches_ignores_search() {
+    // Structural matching and search scoring are independent.
+    let event = make_event("hello", 1);
+    let filter = Filter::new().add_kind(1).add_search("not-in-content");
+    assert!(filter.matches(&event), "matches must not consider the search field");
+    assert!(approx(filter.search_score(&event), 0.0));
+}
+
+// --- Serialization ---
+
+#[test]
+fn search_round_trips_through_json() {
+    let filter = Filter::new().add_kind(1).add_search("best nostr apps");
+    let json = serde_json::to_string(&filter).unwrap();
+    assert!(json.contains("\"search\":\"best nostr apps\""));
+    let parsed: Filter = serde_json::from_str(&json).unwrap();
+    assert_eq!(filter, parsed);
+}
+
+#[test]
+fn search_absent_when_none() {
+    let json = serde_json::to_string(&Filter::new().add_kind(1)).unwrap();
+    assert!(!json.contains("search"));
+}
+
+// --- Grouping and set algebra ---
+
+#[test]
+fn group_distinguishes_searches() {
+    let f1 = Filter::new().add_kind(1).add_search("alpha");
+    let f2 = Filter::new().add_kind(1).add_search("beta");
+    let f3 = Filter::new().add_kind(1).add_search("alpha");
+    assert_ne!(f1.group(), f2.group());
+    assert_eq!(f1.group(), f3.group());
+}
+
+#[test]
+fn union_keeps_different_searches_separate() {
+    let filters = vec![
+        Filter::new().add_kind(1).add_search("alpha"),
+        Filter::new().add_kind(1).add_search("beta"),
+    ];
+    assert_eq!(union_filters(&filters).len(), 2);
+}
+// Filters sharing the same search string union into one filter that keeps both authors.
+#[test]
+fn union_merges_same_search() {
+    let a = SecretKey::generate().public_key();
+    let b = SecretKey::generate().public_key();
+    let filters = vec![
+        Filter::new().add_search("x").add_author(a),
+        Filter::new().add_search("x").add_author(b),
+    ];
+    let result = union_filters(&filters);
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].search.as_deref(), Some("x"));
+    let authors = result[0].authors.as_ref().unwrap();
+    assert!(authors.contains(&a) && authors.contains(&b));
+}
+// Differing searches are not combined: the result holds one filter per original search string.
+#[test]
+fn intersect_keeps_conflicting_searches_separate() {
+    let groups = vec![
+        vec![Filter::new().add_search("nostr")],
+        vec![Filter::new().add_search("bitcoin")],
+    ];
+    let result = intersect_filters(&groups);
+    assert_eq!(result.len(), 2);
+    let searches: std::collections::BTreeSet<_> =
+        result.iter().map(|f| f.search.clone()).collect();
+    assert!(searches.contains(&Some("nostr".to_string())));
+    assert!(searches.contains(&Some("bitcoin".to_string())));
+}
+// A search on only one side carries over: the single result has both the search and the author.
+#[test]
+fn intersect_combines_when_one_side_has_search() {
+    let author = SecretKey::generate().public_key();
+    let groups = vec![
+        vec![Filter::new().add_author(author)],
+        vec![Filter::new().add_search("nostr")],
+    ];
+    let result = intersect_filters(&groups);
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].search.as_deref(), Some("nostr"));
+    assert!(result[0].authors.as_ref().unwrap().contains(&author));
+}
+// Equal searches are kept once; the single expected result lists both kind 1 and kind 2.
+#[test]
+fn intersect_combines_equal_searches() {
+    let groups = vec![
+        vec![Filter::new().add_kind(1).add_search("x")],
+        vec![Filter::new().add_kind(2).add_search("x")],
+    ];
+    let result = intersect_filters(&groups);
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].search.as_deref(), Some("x"));
+    let kinds = result[0].kinds.as_ref().unwrap();
+    assert!(kinds.contains(&1) && kinds.contains(&2));
+}