Add search chapter

This commit is contained in:
Jon Staab
2026-05-20 16:07:58 -07:00
parent d0709e1811
commit 75381b653e
5 changed files with 1014 additions and 9 deletions
+54 -9
View File
@@ -83,6 +83,10 @@ pub struct Filter {
/// Maximum number of events a consumer should return. This is not a /// Maximum number of events a consumer should return. This is not a
/// matching criterion — [`matches`](Filter::matches) ignores it. /// matching criterion — [`matches`](Filter::matches) ignores it.
pub limit: Option<usize>, pub limit: Option<usize>,
/// NIP-50 full-text search query. The string is opaque at this point:
/// its structure and local relevance scoring are the subject of the next
/// chapter. `None` means no search constraint.
pub search: Option<String>,
} }
``` ```
@@ -91,6 +95,10 @@ the struct. But it is a result-count constraint for consumers (relays,
storage engines), not a predicate over individual events. The `matches` storage engines), not a predicate over individual events. The `matches`
method ignores it entirely. method ignores it entirely.
The `search` field comes from NIP-50. At this point it is just a string
carried alongside the other fields; the [Search](12-search.md) chapter
gives it meaning — a query model and a local relevance score.
## Matching ## Matching
Matching walks each present field and returns `false` as soon as one Matching walks each present field and returns `false` as soon as one
@@ -190,6 +198,7 @@ impl Filter {
since: None, since: None,
until: None, until: None,
limit: None, limit: None,
search: None,
} }
} }
@@ -446,6 +455,9 @@ impl Serialize for Filter {
if self.limit.is_some() { if self.limit.is_some() {
count += 1; count += 1;
} }
if self.search.is_some() {
count += 1;
}
let mut map = serializer.serialize_map(Some(count))?; let mut map = serializer.serialize_map(Some(count))?;
@@ -477,6 +489,9 @@ impl Serialize for Filter {
if let Some(limit) = self.limit { if let Some(limit) = self.limit {
map.serialize_entry("limit", &limit)?; map.serialize_entry("limit", &limit)?;
} }
if let Some(search) = &self.search {
map.serialize_entry("search", search)?;
}
map.end() map.end()
} }
@@ -511,6 +526,7 @@ impl<'de> Visitor<'de> for FilterVisitor {
let mut since: Option<u64> = None; let mut since: Option<u64> = None;
let mut until: Option<u64> = None; let mut until: Option<u64> = None;
let mut limit: Option<usize> = None; let mut limit: Option<usize> = None;
let mut search: Option<String> = None;
while let Some(key) = map.next_key::<String>()? { while let Some(key) = map.next_key::<String>()? {
match key.as_str() { match key.as_str() {
@@ -544,6 +560,7 @@ impl<'de> Visitor<'de> for FilterVisitor {
"since" => since = Some(map.next_value()?), "since" => since = Some(map.next_value()?),
"until" => until = Some(map.next_value()?), "until" => until = Some(map.next_value()?),
"limit" => limit = Some(map.next_value()?), "limit" => limit = Some(map.next_value()?),
"search" => search = Some(map.next_value()?),
other if other.starts_with('#') => { other if other.starts_with('#') => {
let tag_name = other[1..].to_string(); let tag_name = other[1..].to_string();
let values: Vec<String> = map.next_value()?; let values: Vec<String> = map.next_value()?;
@@ -563,6 +580,7 @@ impl<'de> Visitor<'de> for FilterVisitor {
since, since,
until, until,
limit, limit,
search,
}) })
} }
} }
@@ -634,6 +652,7 @@ impl Filter {
/// (structural shape) /// (structural shape)
/// - The exact `since` and `until` values (different time windows /// - The exact `since` and `until` values (different time windows
/// cannot be combined) /// cannot be combined)
/// - The exact `search` query (different searches cannot be combined)
/// ///
/// A filter with a `limit` always gets a unique group key, because /// A filter with a `limit` always gets a unique group key, because
/// merging limited filters would change result-count semantics. /// merging limited filters would change result-count semantics.
@@ -649,6 +668,7 @@ impl Filter {
self.since.hash(&mut hasher); self.since.hash(&mut hasher);
self.until.hash(&mut hasher); self.until.hash(&mut hasher);
self.search.hash(&mut hasher);
if self.limit.is_some() { if self.limit.is_some() {
// Each limited filter gets a unique group — merging two // Each limited filter gets a unique group — merging two
@@ -669,7 +689,9 @@ and a filter on `#p` tags land in different groups — as they should,
since merging them by union would change the semantics. Likewise, two since merging them by union would change the semantics. Likewise, two
filters with different `since` or `until` values land in different filters with different `since` or `until` values land in different
groups, because a union of their sets under one time window would either groups, because a union of their sets under one time window would either
over-fetch or under-fetch relative to what was requested. over-fetch or under-fetch relative to what was requested. The `search`
query is treated the same way: two filters with different searches can
never be merged, so each distinct search forms its own group.
## Union and intersection ## Union and intersection
@@ -741,8 +763,9 @@ the earliest `until`. Finally it passes the result through
/// ///
/// Set fields are unioned. Time windows are tightened: the latest /// Set fields are unioned. Time windows are tightened: the latest
/// `since` and earliest `until` win. If both filters have a `limit`, /// `since` and earliest `until` win. If both filters have a `limit`,
/// the larger one is kept. The result is simplified with /// the larger one is kept. Two filters carrying *different* searches
/// [`union_filters`]. /// cannot be combined into one, so the pair is kept separate instead.
/// The result is simplified with [`union_filters`].
pub fn intersect_filters(groups: &[Vec<Filter>]) -> Vec<Filter> { pub fn intersect_filters(groups: &[Vec<Filter>]) -> Vec<Filter> {
let Some(first) = groups.first() else { let Some(first) = groups.first() else {
return vec![]; return vec![];
@@ -755,7 +778,15 @@ pub fn intersect_filters(groups: &[Vec<Filter>]) -> Vec<Filter> {
for f1 in &result { for f1 in &result {
for f2 in filters { for f2 in filters {
combined.push(combine_pair(f1, f2)); match combine_pair(f1, f2) {
Some(f) => combined.push(f),
// Two different searches can't be combined into one
// filter; keep both so neither query is lost.
None => {
combined.push(f1.clone());
combined.push(f2.clone());
}
}
} }
} }
@@ -765,7 +796,16 @@ pub fn intersect_filters(groups: &[Vec<Filter>]) -> Vec<Filter> {
union_filters(&result) union_filters(&result)
} }
fn combine_pair(a: &Filter, b: &Filter) -> Filter { fn combine_pair(a: &Filter, b: &Filter) -> Option<Filter> {
// Two different searches cannot be expressed as a single search, so
// there is no filter that satisfies both. Returning `None` tells the
// caller to keep the pair separate rather than fabricate one.
if let (Some(s1), Some(s2)) = (&a.search, &b.search) {
if s1 != s2 {
return None;
}
}
let mut f = Filter::new(); let mut f = Filter::new();
f.ids = union_option_sets(&a.ids, &b.ids); f.ids = union_option_sets(&a.ids, &b.ids);
@@ -794,7 +834,10 @@ fn combine_pair(a: &Filter, b: &Filter) -> Filter {
(l, None) | (None, l) => l, (l, None) | (None, l) => l,
}; };
f // At most one search is present here (equal searches collapse to one).
f.search = a.search.clone().or_else(|| b.search.clone());
Some(f)
} }
fn union_option_sets<T: Ord + Clone>( fn union_option_sets<T: Ord + Clone>(
@@ -815,6 +858,8 @@ fn union_option_sets<T: Ord + Clone>(
## What's next ## What's next
The next chapter extends filters with NIP-50 full-text search — an The `search` field rides along through serialization, grouping, and the
optional `search` field that some relays support for content-based set algebra here, but it has no meaning yet — `matches` doesn't look at
queries. it, and the string is uninterpreted. The next chapter takes up NIP-50
full-text search: a typed query model that separates terms from
extensions, a local relevance score, and relevance-ordered results.
+214
View File
@@ -0,0 +1,214 @@
# Search
NIP-50 adds one field to the filter from the previous chapter: a `search`
string. A relay that advertises the capability reads the string as a
human-readable query — `best nostr apps` — matches it against event content,
and returns results ordered by relevance rather than by `created_at`, with
`limit` applied after ranking.
Search is opt-in and implementation-defined. Relays decide whether they index events
at all, what matches, and how ranking works. The query may also carry
`key:value` extensions — `domain:`, `language:`, `sentiment:`, `nsfw:`,
`include:spam` — and a relay honors only the ones it understands, ignoring the
rest. There is no global index and no guarantee of completeness: a client
queries the relays it believes support search and accepts a partial view.
Search may be implemented relay-side, or it may be performed on a client in some
situations. This chapter provides utilities for parsing search terms along with
a very basic model for implementing search that is decoupled from filter matching
itself and entirely opt-in.
## The module
```rust {file=coracle-lib/src/lib.rs}
pub mod search;
```
```rust {file=coracle-lib/src/search.rs}
//! NIP-50 full-text search queries.
//!
//! A [`SearchQuery`] holds the terms of a search string and computes a
//! best-effort relevance score against event content — for the case where
//! search runs on the client, over events already in hand, rather than on a
//! relay.
use std::fmt;
```
## The query model
A `SearchQuery` is just the query's terms: the words split out of the search
string. NIP-50 also defines `key:value` extensions, but their meaning is
relay-defined, and the local scorer has no way to evaluate `sentiment:negative`
or `domain:example.com` without data it doesn't have. Rather than model
extensions we can't honor, we treat every token as a term. A relay that
understands an extension still sees it verbatim in the query string; the local
scorer simply matches it as text like any other word.
```rust {file=coracle-lib/src/search.rs}
/// A parsed NIP-50 search query: the terms of the query string.
///
/// NIP-50 `key:value` extensions are not modeled separately — their semantics
/// are relay-defined and cannot be evaluated locally, so each is kept as an
/// ordinary term.
///
/// The derived `Default` value is the empty query: a query with no terms.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct SearchQuery {
/// The query's terms, in order.
///
/// The field is public, so a hand-built query may contain terms that
/// splitting a string on whitespace would never produce, such as an empty
/// string or a term with embedded spaces.
pub terms: Vec<String>,
}
```
### Parsing
Parsing splits the query on whitespace. Every token becomes a term, including
anything that looks like an extension. There is nothing to reject, so parsing is
total — it never errors.
```rust {file=coracle-lib/src/search.rs}
impl SearchQuery {
/// Create an empty query.
pub fn new() -> Self {
SearchQuery::default()
}
/// Parse a raw query string by splitting it on whitespace. Every token,
/// extension-like or not, becomes a term. Parsing never fails.
pub fn parse(input: &str) -> Self {
SearchQuery {
terms: input.split_whitespace().map(str::to_string).collect(),
}
}
/// True when the query has no terms.
pub fn is_empty(&self) -> bool {
self.terms.is_empty()
}
}
```
Rendering joins the terms back into a query string. It is the inverse of
parsing: feeding the output of one into the other gives an equal query, modulo
runs of whitespace collapsing to single spaces.
```rust {file=coracle-lib/src/search.rs}
impl fmt::Display for SearchQuery {
    /// Render the query as its terms separated by single spaces.
    ///
    /// An empty query renders as the empty string. Terms are written to the
    /// formatter as-is; width, fill, and precision flags are not applied.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut remaining = self.terms.iter();
        if let Some(first) = remaining.next() {
            f.write_str(first)?;
        }
        for term in remaining {
            f.write_str(" ")?;
            f.write_str(term)?;
        }
        Ok(())
    }
}
```
## Scoring
NIP-50 returns results in descending order of relevance, so a boolean "matches
or not" is the wrong shape for a local implementation. The scorer instead
returns a number in `0.0..=1.0`, which can drive both inclusion (anything above
zero is a hit) and ordering.
The score has two parts. The base is the fraction of the query's terms that
appear in the content, compared case-insensitively — three terms, two present,
gives `2/3`. On top of that, repeated occurrences add a small, diminishing
bonus, so that among events matching the same set of terms the ones that mention
them more often rank higher. The bonus is always strictly less than `1/total`, which means
it can reorder events *within* a fraction but can never push a partial match up
to a full one: a missing term always costs more than any number of repetitions
can recover. An empty query — no terms — scores `1.0`, since there is no text to
constrain.
```rust {file=coracle-lib/src/search.rs}
impl SearchQuery {
    /// Relevance of `content` for this query, as a number in `0.0..=1.0`.
    ///
    /// Both the terms and the content are lowercased with `to_lowercase`
    /// before comparison, and a term counts as present when it occurs in the
    /// content as a substring. The score is the fraction of terms present,
    /// plus a small bonus for repeated occurrences. The bonus grows with the
    /// number of repeats but always stays below the value of one term, so
    /// content that misses a term scores below `1.0` no matter how often the
    /// other terms repeat. A query with no terms scores `1.0`, because it
    /// constrains nothing.
    pub fn score(&self, content: &str) -> f64 {
        if self.terms.is_empty() {
            return 1.0;
        }

        let term_count = self.terms.len() as f64;
        let lowered = content.to_lowercase();

        // Accumulates (terms found, occurrences beyond the first of each
        // found term) across all terms of the query.
        let (found, repeats) =
            self.terms
                .iter()
                .fold((0usize, 0usize), |(found, repeats), term| {
                    let needle = term.to_lowercase();
                    if needle.is_empty() {
                        // Nothing to look for: an empty term counts as found
                        // and contributes no repeats.
                        return (found + 1, repeats);
                    }
                    // `str::matches` yields non-overlapping occurrences.
                    match lowered.matches(needle.as_str()).count() {
                        0 => (found, repeats),
                        hits => (found + 1, repeats + (hits - 1)),
                    }
                });

        let coverage = found as f64 / term_count;
        // 1 - 1/(1 + repeats) climbs from 0 towards 1; dividing by the term
        // count keeps the bonus under one term's share of the score.
        let repeat_bonus = (1.0 - 1.0 / (1.0 + repeats as f64)) / term_count;
        f64::min(coverage + repeat_bonus, 1.0)
    }
}
```
Lowercasing uses `to_lowercase`, which folds case across Unicode rather than
only ASCII. That allocates, but nostr content is multilingual, and correctness
on non-Latin text is worth more than avoiding a copy in a best-effort matcher.
## Connecting queries to filters
The previous chapter gave `Filter` a `search` field but no way to set it. The
setters follow the established `add_*` / `clear_*` vocabulary.
```rust {file=coracle-lib/src/filters.rs}
use crate::search::SearchQuery;
impl Filter {
/// Set the NIP-50 search query.
pub fn add_search(mut self, search: impl Into<String>) -> Self {
self.search = Some(search.into());
self
}
/// Remove the search query, leaving no search constraint.
pub fn clear_search(mut self) -> Self {
self.search = None;
self
}
}
```
Scoring an event against a filter is then a matter of parsing the field and
delegating to `SearchQuery::score`. With no search set the method returns `1.0`,
so an unsearched filter never penalizes an event. This is purely the search
dimension — it is independent of the structural `matches` check from the
previous chapter, and the two are meant to be composed by the caller, not folded
together. A consumer that wants search-ranked results filters with `matches`,
scores with `search_score`, and sorts as it sees fit.
```rust {file=coracle-lib/src/filters.rs}
impl Filter {
    /// Best-effort local relevance of `event` for this filter's search, in
    /// `0.0..=1.0`.
    ///
    /// When a `search` string is set it is parsed into a [`SearchQuery`] and
    /// scored against the event's content. When no search is set the result
    /// is `1.0`, so a filter without a search never lowers an event's score.
    /// Only the `search` field is consulted; this is independent of
    /// [`matches`](Filter::matches).
    pub fn search_score(&self, event: &Event) -> f64 {
        self.search
            .as_deref()
            .map_or(1.0, |query| SearchQuery::parse(query).score(&event.content))
    }
}
```
## What's next
Search depends on routing the query to a relay that actually supports it.
Discovering which relays advertise NIP-50, and choosing among them, is a
networking and relay-metadata concern — the subject of the Domain and Networking
sections, where relay selection is built on top of the filter types assembled
here.
+189
View File
@@ -0,0 +1,189 @@
# Plan: Search
## Topic Summary
NIP-50 adds an optional full-text `search` field to the subscription filter from
chapter 11. A relay that supports the capability interprets the query against
event content (and, for some kinds, other fields), returning results ordered by
relevance rather than `created_at`, with `limit` applied after ranking. The
query may carry `key:value` extensions — `domain:`, `language:`, `sentiment:`,
`nsfw:`, `include:spam` — which relays may support or ignore.
This chapter extends `Filter` with a `search` field, threads it through
serialization / grouping / set algebra, introduces a typed `SearchQuery` that
splits free-text terms from `key:value` extensions, and implements a best-effort
local relevance **score in [0, 1]** used to both include and rank events —
mirroring the NIP's "descending order by quality of result, limit last."
## Chapter Outline
1. **Intro / framing** — Search as a relay-defined, optional capability; content
discovery is client-initiated routing, not a global index; results are
partial and ranked by the relay. The local matcher is an honest best-effort
fallback, not a reimplementation of relay search.
2. **The `search` field** — Add `search: Option<String>` to `Filter`; builder
methods `add_search` / `clear_search`; note it joins the derived `Hash` (so
`id()` covers it for free).
3. **Serialization** — Emit/parse a plain `"search"` key in the hand-written
serde impl, present only when `Some`.
4. **The `SearchQuery` model** — A new `search` module: terms + ordered
`key:value` extensions, `parse`, `Display`, builders, and the `Filter` bridge.
5. **Scoring & matching** — `search_score` (fraction-of-terms + diminishing
frequency bonus, capped at 1.0); `matches` includes an event when score > 0;
`rank_search_results` sorts by score then `created_at` and applies `limit`.
6. **Grouping and set algebra** — `search` enters `group()` (distinct searches
never merge); `union_filters` carries it through unchanged; `intersect_filters`
keeps a conflicting-search pair separate instead of fabricating a combined query.
7. **What's next** — Brief pointer to the Domain section (relay selection,
discovering NIP-50-capable relays via relay metadata, is a later concern).
## API Design
### `coracle-lib/src/filters.rs` (extends existing `Filter`)
```rust
pub struct Filter {
// ... existing fields ...
/// NIP-50 full-text search query. Relay-interpreted; see `SearchQuery`.
pub search: Option<String>,
}
impl Filter {
pub fn add_search(self, search: impl Into<String>) -> Self; // sets Some
pub fn clear_search(self) -> Self; // sets None
/// Bridge to the typed model.
pub fn add_search_query(self, query: &SearchQuery) -> Self; // = add_search(query.to_string())
pub fn search_query(&self) -> Option<SearchQuery>; // parse the field back
/// Best-effort local relevance score in [0.0, 1.0].
/// Returns 1.0 when there is no search, or a search with no free-text
/// terms (only extensions, which are unenforceable locally).
pub fn search_score(&self, event: &Event) -> f64;
}
/// Filter `events` to those matching `filter`, sort by relevance
/// (search_score desc, then created_at desc), and apply `filter.limit`.
pub fn rank_search_results<'a>(filter: &Filter, events: &'a [Event]) -> Vec<&'a Event>;
```
`matches` gains a final check: `if self.search_score(event) == 0.0 { return false }`.
Because `search_score` returns 1.0 when there is no search (or no terms), this
only rejects when a search *with terms* matched none of them — i.e. "any term
present ⇒ included."
### `coracle-lib/src/search.rs` (new module)
```rust
/// A parsed NIP-50 search query: free-text terms plus `key:value` extensions.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct SearchQuery {
pub terms: Vec<String>,
pub extensions: Vec<(String, String)>, // ordered; repeats allowed
}
impl SearchQuery {
pub fn new() -> Self;
/// Total parse: split on whitespace; a token is an extension iff it is
/// `key:value` with key in [A-Za-z0-9_-]+, non-empty value not starting
/// with '/'. Everything else is a term. Never fails.
pub fn parse(input: &str) -> Self;
pub fn add_term(self, term: impl Into<String>) -> Self;
pub fn add_extension(self, key: impl Into<String>, value: impl Into<String>) -> Self;
pub fn is_empty(&self) -> bool;
}
impl fmt::Display for SearchQuery { /* terms first, then "key:value" exts, space-joined */ }
```
`Filter::matches` / `search_score` tokenize via `SearchQuery::parse`, using only
`terms` (extensions are ignored by the local matcher).
### Scoring formula (`search_score`)
For the parsed query's distinct `terms` (case-insensitive), against
`event.content` lowercased:
- `total` = number of distinct terms; if 0 → return 1.0.
- For each term, `count` = non-overlapping occurrences in content.
- `matched` = terms with `count ≥ 1`; `extra` = (Σ count) - matched (repeats
beyond the first hit of each matched term).
- `base = matched / total` (fraction of terms present, in [0, 1]).
- `bonus = (1 - 1/(1 + extra)) / total` (diminishing, strictly `< 1/total`, so a
partial match never reaches the next term's bucket).
- `score = (base + bonus).min(1.0)`.
Properties (asserted in tests): in [0, 1]; all terms once ⇒ 1.0; missing a term ⇒
`< 1.0`; more occurrences ⇒ ≥ score (monotonic, never exceeds 1.0); no terms
matched ⇒ exactly 0.0.
## Code Organization
- **`coracle-lib/src/filters.rs`** — add the `search` field, builders, the
serde changes, `search_score`, the `matches` check, `rank_search_results`,
and the `group()` / `intersect_filters` updates. `use crate::search::SearchQuery;`.
- **`coracle-lib/src/search.rs`** — the `SearchQuery` type. New `pub mod search;`
in `lib.rs`, placed before `filters` (filters depends on it).
- **`coracle-lib/src/prelude.rs`** — add `pub use crate::search::SearchQuery;`
(the prelude already re-exports commonly used items).
- **`coracle-lib/tests/search.rs`** — hand-written integration tests (not tangled).
## Dependencies
None new. Parsing and matching use `std` only. No FTS engine — out of scope and
against the minimal-dependency rule.
## Narrative Notes
- Open with the philosophy: search is opt-in and relay-defined; no global index;
results partial and relay-ranked. Frame the local scorer as a fallback for
in-memory/offline querying, and warn (per rust-nostr's SDK) that re-filtering a
relay's returned results client-side can wrongly drop legitimate hits — relays
rank with richer, extension-aware logic.
- Explain *why* extensions are parsed but **ignored locally**: `sentiment:`,
`domain:`, etc. require data the client doesn't have, so honoring them locally
is impossible; we keep them in the typed model for *building/inspecting*
queries, not for local evaluation.
- Justify the score model concretely: NIP-50 mandates relevance ordering, so a
boolean match is the wrong shape — a [0,1] score lets us both include
(score > 0) and rank. Walk through the fraction + diminishing-bonus formula
with a small worked example.
- For grouping: reuse the chapter-11 reasoning — two filters with different
searches can't be unioned without changing semantics, so `search` joins the
group key. Show that `union_filters` then keeps them separate automatically.
- For `intersect_filters`: explain the one structural change — `combine_pair`
returns `Option<Filter>`; a pair whose two searches differ returns `None`, and
the caller emits both filters separately rather than concatenating queries.
## Design Decisions
1. **Typed `SearchQuery`, lean/generic.** Terms + a generic ordered list of
`key:value` extensions, with `add_term`/`add_extension`. No per-extension
helpers or typed enums — keeps the surface small and forward-compatible with
relay-specific extensions. (Every reference treats search as opaque; the typed
model is our value-add.)
2. **Local relevance score in [0, 1]**, fraction-of-terms + diminishing frequency
bonus, capped at 1.0. Chosen over a boolean to model NIP-50's relevance
ordering. Extensions excluded from scoring.
3. **`matches` includes on score > 0** ("any term present"); ranking via
`rank_search_results` handles relevance + `limit`-after-sort.
4. **`search` participates in `group()`**, so `union_filters` never merges
distinct searches.
5. **`intersect_filters` keeps a conflicting-search pair separate** (combine
returns `Option`, `None` ⇒ emit both) rather than concatenating, per the
user's choice.
6. **Builder naming `add_search`/`clear_search`** to match the existing
`add_since`/`clear_since` vocabulary (not rust-nostr's `search`/`remove_search`).
7. **Unicode-aware lowercasing** (`to_lowercase`) for the local matcher rather
than ASCII-only, given multilingual nostr content; note the allocation
trade-off. Substring counting via `str::matches`.
8. **Extension parse heuristic** documented: a colon-bearing token like a URL may
be read as an extension; applications needing exact control build
`SearchQuery` field-by-field instead of parsing.
## Open Questions
- Exact wording of the frequency-bonus explanation — keep the formula in prose
light; lean on a worked example. (Resolved during writing.)
- Whether `rank_search_results` belongs as a free function (consistent with
`matches_any`/`union_filters`) — yes, free function.
+307
View File
@@ -0,0 +1,307 @@
# Research: Search
## Topic Summary
NIP-50 adds an optional full-text `search` field to the subscription filter
introduced in chapter 11. A relay that supports the capability interprets the
query string against event content (and, for some kinds, other fields),
returning results ordered by relevance rather than `created_at`. The query may
carry structured extensions in the form of `key:value` pairs — `domain:`,
`language:`, `sentiment:`, `nsfw:`, `include:spam` — which relays may support or
ignore.
The chapter will:
1. Add a `search` field to the existing `Filter` type, wiring it through
construction, serialization, hashing, grouping, and the union/intersect
utilities.
2. Introduce a typed `SearchQuery` model that splits free-text terms from
`key:value` extensions, so applications can build and inspect queries safely
instead of stringly-typed concatenation. (This is a deliberate departure
from every reference, which treats the query as an opaque string.)
3. Implement a best-effort, case-insensitive local matcher over event content,
while documenting that real ranking and extension semantics are
relay-defined.
The code lives in `coracle-lib`: the `search` field extends `filters.rs`, and
the query model gets a dedicated `search.rs` module.
## Philosophy
From `ref/building-nostr`, the framing relevant to search is that **content
discovery on nostr is client-initiated routing through relay selection**, not a
query against a global index. Searching is "knowing where to send queries." A
relay that supports NIP-50 is exercising an *optional, relay-authored
capability* — like content curation or access control — and defines its own
matching semantics, including which extensions it honors. This mirrors the NIP's
own "relays SHOULD ignore extensions they don't support."
Three principles bear directly on the chapter's voice:
- **No guaranteed completeness.** "No implementation will have a complete view
of every heuristic that is applicable" — so search results are neither global
nor exhaustive. A client queries the relays it knows support search and
accepts a partial, spontaneous view. This should be stated honestly, not hidden.
- **Indexing is the curator's responsibility, not the user's.** Authors publish
signed events; relays (or indexing services) that *want* content discoverable
maintain the index. Clients do nothing special beyond sending a `search`
filter to a search-capable relay.
- **Publicity, not privacy.** Full-text indexing makes content patterns
discoverable and gives relay operators visibility into queries. The honest
framing: search is a publicity feature.
The takeaway for our library: model `search` as a first-class but optional
filter field, keep the query structured enough that applications can reason
about it, and be candid that local matching is a best-effort approximation of a
relay-defined operation.
## Reference Implementation Analysis
### applesauce
`search` is an optional string on an extended `Filter` type
(`packages/core/src/helpers/filter.ts`): `Filter = CoreFilter & { search?: string }`,
extending nostr-tools' base type. **Opaque** — no extension parsing.
Dual-mode: relay subscriptions pass the string through verbatim; a local SQLite
backend (`packages/sqlite`) indexes content into an FTS5 table and runs
`events_search MATCH ?` with the raw string double-quote-escaped. Local
client-side `matchFilter()` **ignores** the search field entirely. Pluggable
"search content formatters" decide what gets indexed (default: `content`;
enhanced: kind-0 profile fields plus `t`/`subject`/`title`/`summary`/`d` tags).
Supports `order: "created_at" | "rank"` for FTS5 ranking. Low coupling; SQLite
is optional. No query-extension awareness anywhere.
### ndk
`search?: string` on `NDKFilter` (`core/src/subscription/index.ts:30`).
**Opaque, relay-only.** No parsing, no validation (filter-validation pipeline
skips it), no client-side matching (delegates to nostr-tools' `matchFilters`,
which ignores search). No helper functions for building search filters; callers
construct `{ search: "..." }` by hand. The field is serialized and sent to
relays as-is. No NIP-11 capability negotiation or fallback. Minimal by design.
### nostr-gadgets
Re-uses `@nostr/tools`' `Filter` type (`search?: string`). **Opaque,
relay-only.** Notably its local stores *reject* search: the in-memory store
returns an empty set if `filter.search` is present, and the RedEventStore docs
state "any filters supported (except 'search')." Provides a hardcoded
`SEARCH_RELAYS` constant (`defaults.ts`): `relay.nostr.band`, `nostr.wine`,
`relay.noswhere.com`, `relay.nos.today`. No query builders, no dynamic relay
capability detection.
### nostrlib (Go)
`Search string` on the `Filter` struct (`filter.go`), (de)serialized as a plain
`"search"` JSON key. The core `Filter.Matches` / `MatchesIgnoringTimestampConstraints`
**ignores** search — matching is delegated to eventstore backends. Key-value
backends (BoltDB, LMDB, MMM) return nothing for search queries; only the **Bleve**
backend implements real full-text search: per-document language auto-detection
(lingua-go, 22 languages), per-language analyzers, boolean query syntax
(`AND/OR/NOT`, parens, quoted phrases), NIP-27 reference extraction with 2× boost,
and case-insensitive substring validation of quoted phrases. Kind-0 profiles index
name/display_name/about; reposts unpack inner events. Khatru relay policies
`NoSearchQueries`/`RemoveSearchQueries` let operators disable search. SDK
`SearchUsers()` just sends a `Search` filter to designated user-search relays. No
NIP-50 *extension* parsing (treats `domain:x` as a regular word); a 2-char minimum
query length is enforced by Bleve.
### nostr-tools
`search?: string` on the base `Filter` (`filter.ts`). **The canonical
"defined-but-unused" implementation.** `matchFilter()`/`matchFilters()` do not
check search at all; `mergeFilters()` drops it entirely. No parsing, no
validation, no helpers, no tests for the field. Strictly a transport-layer
placeholder so applications can send search filters to relays. Minimal-deps
philosophy: search is purely a relay concern.
### rust-nostr
The most directly relevant reference (also Rust). In
`crates/nostr/src/filter.rs`:
```rust
/// A string describing a query in a human-readable form, i.e. "best nostr apps"
/// <https://github.com/nostr-protocol/nips/blob/master/50.md>
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub search: Option<String>,
```
Builder API: `search<S: Into<String>>(self, value: S) -> Self` and
`remove_search(self) -> Self` — symmetric, generic, `#[inline]`. **Opaque** (no
extension parsing).
Local matching (`search_match`):
```rust
fn search_match(&self, event: &Event) -> bool {
match &self.search {
Some(query) => event.content.as_bytes()
.windows(query.len())
.any(|window| window.eq_ignore_ascii_case(query.as_bytes())),
None => true,
}
}
```
Case-insensitive **ASCII** substring via sliding window; `None` matches
everything. Gated by a `MatchEventOptions { nip50: bool, .. }` flag (default
true). Notably, the SDK relay sets `.nip50(false)` with the comment "Skip NIP-50
matches since they may create issues and ban non-malicious relays" — i.e.
client-side re-matching of a relay's search results can wrongly drop valid hits.
DB backends (LMDB, SQLite) extend matching to a fixed set of searchable tags —
`title`, `description`, `subject`, `name` — lowercasing the query once up front;
empty search → no results. A `Features { full_text_search: bool }` flag declares
backend capability.
Patterns worth emulating: `Into<String>` builder, `skip_serializing_if` for a
clean wire format, an explicit opt-out for search matching, ASCII case folding
for speed.
### welshman
The TypeScript toolkit our library descends from. `search?: string` on `Filter`
(`packages/util/src/Filters.ts`). It is the **only reference that matches search
locally and threads it through filter utilities**:
```typescript
export const matchFilter = (filter, event) => {
if (!nostrToolsMatchFilter(filter, event)) return false
if (filter.search) {
const content = event.content.toLowerCase()
const terms = filter.search.toLowerCase().split(/\s+/g)
for (const term of terms) {
if (content.includes(term)) return true
return false // <-- bug: returns after first term
}
}
return true
}
```
The intent is term-splitting + case-insensitive substring, but the early
`return false` means only the first term is ever checked. **A correct version
should decide AND vs OR across terms explicitly** — this is the one place we can
clearly improve on the reference.
Filter utilities (directly parallel to our `group`/`union_filters`/`intersect_filters`):
- `calculateFilterGroup` pushes `search:${search}` into the group key — **a
filter with a search is only mergeable with an identical search.**
- `unionFilters` treats `search` (like `since`/`until`/`limit`) as a scalar
preserved from the first filter in the group, **not merged**.
- `intersectFilters` concatenates differing searches with a space
(`[a, b].join(" ")`) — modeling "must match both" as a compound query — and
takes whichever is present otherwise.
- `getFilterId` includes search in the deterministic hash, so different searches
never dedupe.
Search-relay selection lives in the router: `getSearchRelays()` returns relays
whose NIP-11 `supported_nips` includes `"50"`. No extension parsing.
## Common Patterns
- **`search` is universally an optional plain string.** Every reference models
it as `Option<String>` / `search?: string`. None parse the `key:value`
extensions — they treat the whole query as opaque and let the relay interpret
it. Our typed `SearchQuery` is therefore a value-add, not a port.
- **Local matching is the exception, not the rule.** nostr-tools, ndk,
applesauce (in `matchFilter`), and nostrlib's core `Filter` all *ignore*
search locally; matching happens relay-side (or in a dedicated index like
Bleve/FTS5). Only rust-nostr and welshman attempt local matching, both with
case-insensitive substring over `content`.
- **Where matching exists, it's case-insensitive substring** — rust-nostr does
ASCII-only `eq_ignore_ascii_case` over byte windows (whole query as one
needle); welshman lowercases and splits on whitespace into terms (intending
multi-term, buggily). rust-nostr's DB backends additionally search a small
fixed set of metadata tags (`title`, `description`, `subject`, `name`).
- **Search makes filters un-mergeable.** welshman's group key encodes the
general intuition: two filters with different search strings can't be
unioned without changing semantics. rust-nostr sidesteps merging at this layer
entirely.
- **Client-side re-matching is risky.** rust-nostr's SDK disables NIP-50
matching when filtering relay results, because a relay's notion of a match
(ranked, fuzzy, multi-field, extension-aware) is richer than a client's
substring check — re-filtering can drop legitimate hits.
- **Relay selection by NIP-11.** Search-capable relays are discovered via
`supported_nips` containing `50` (welshman) or a hardcoded allowlist
(nostr-gadgets). This is an application/networking concern, out of scope for
`coracle-lib`.
## Considerations for Our Implementation
**Filter field.** Add `pub search: Option<String>` to `Filter`. Follow
rust-nostr: `add_search<S: Into<String>>(self, S)` and `clear_search(self)` to
match the existing `add_*`/`clear_*` builder vocabulary (our methods are named
`add_since`/`clear_since`, etc., so `add_search`/`clear_search` fits better than
rust-nostr's `search`/`remove_search`). Once added, the field participates in
the derived `Hash` automatically (so `id()` covers it for free), but
serialization, `group()`, `union_filters`, `intersect_filters`, and `matches()`
all need explicit updates.
**Serialization.** Our `Filter` has hand-written serde (to flatten `#tag` keys).
Add `search` as a plain `"search"` key — emit only when `Some` (mirroring
`since`/`until`/`limit`), and read it in the visitor's match arm. A round-trip
test must cover it.
**Grouping / union / intersect.** Per welshman: include `search` in the
`group()` hash so filters with different searches land in different groups (never
merged). In `union_filters`, since group members share an identical search by
construction, the search carries over via the `or_insert_with(|| filter.clone())`
seed — no special merge needed, but worth a comment. In `combine_pair`
(intersect), decide how to combine two searches: welshman concatenates with a
space. Concatenation is defensible ("must match both") but lossy and surprising;
a cleaner rule for a typed model is to **merge two `SearchQuery` values** (union
their terms and extensions) or, if we keep the field as a string at this layer,
to concatenate with a space and document it. Recommend: concatenate with a space
when both present and differ, matching welshman, and note the limitation.
**Local matching.** Extend `Filter::matches` to test `search` *after* the cheap
scalar checks. Best-effort, case-insensitive. Two design choices to settle in
planning:
1. Whole-query substring (rust-nostr) vs. term-split AND/OR (welshman, fixed).
A typed `SearchQuery` makes term-split natural: match the free-text terms
(AND across terms reads as the intuitive "all words present"; document it),
and treat `key:value` extensions as *unenforceable locally* — i.e. ignored by
the local matcher, since we can't evaluate `sentiment:` or `domain:` without
external data. This honesty matches the NIP.
2. ASCII (`eq_ignore_ascii_case`) vs. Unicode lowercasing. ASCII is what
rust-nostr ships and is allocation-free; Unicode `to_lowercase` is more
correct for non-Latin content but allocates. Given nostr's multilingual
content, prefer Unicode `to_lowercase` for the local matcher — correctness
over micro-optimization, consistent with our "clarity over cleverness" rule —
and note the trade-off.
Also document, per rust-nostr's SDK, that local matching is a *fallback*:
relay results should generally be trusted as-is rather than re-filtered.
**`SearchQuery` model (new `search.rs`).** A struct splitting a query into
free-text `terms: Vec<String>` and `extensions: Vec<(String, String)>` (ordered;
NIP-50 doesn't forbid repeats, and order can matter to relays). Parsing: split on
whitespace, treat a token containing `:` (with a non-empty key before it) as an
extension, everything else as a term. Provide:
- `SearchQuery::parse(&str) -> SearchQuery` (total, never fails — unknown shapes
fall back to terms).
- `Display` / `to_string()` that re-renders to the wire string (terms first or
preserve order; planning to decide).
- Builder helpers: `term`, `extension`, plus typed convenience for the
spec-defined extensions (`domain`, `language`, `sentiment`, `nsfw`,
`include_spam`) — optional, decide scope in planning.
- A bridge to `Filter`: `Filter::add_search` can accept `impl Into<String>` so
both a raw string and `query.to_string()` work; optionally
`Filter::search_query()` to parse the field back out.
Keep `sentiment`/`nsfw` values as strings (or small enums) — leaning toward
strings to stay forward-compatible with relay-specific values, with named
constructors for the common cases.
**Dependencies.** None new. Parsing is plain string handling; matching uses std.
Avoid pulling in a real FTS engine — out of scope and against the
minimal-dependency rule.
**Out of scope (defer / mention only).** Real relevance ranking; relay-side
indexing; NIP-11 search-relay discovery (a networking concern); the `order`
hint from applesauce; multi-field/tag matching beyond `content` (could mention
`title`/`subject` as a possible extension but keep the matcher content-only for
clarity).
+250
View File
@@ -0,0 +1,250 @@
use coracle_lib::events::{Event, EventContent};
use coracle_lib::filters::{intersect_filters, union_filters, Filter};
use coracle_lib::keys::SecretKey;
use coracle_lib::search::SearchQuery;
/// Returns a deterministic secret key built from the bytes 1 through 32,
/// hex-encoded and parsed with `SecretKey::from_hex`.
fn fixed_secret() -> SecretKey {
    // Fill the buffer with 1, 2, ..., 32 rather than spelling the bytes out.
    let mut bytes = [0u8; 32];
    for (slot, value) in bytes.iter_mut().zip(1u8..) {
        *slot = value;
    }
    let encoded = hex::encode(bytes);
    SecretKey::from_hex(&encoded).unwrap()
}
/// Builds a signed kind-1 event with the given `content` and `created_at`
/// timestamp, owned and signed by the key from `fixed_secret`.
fn make_event(content: &str, created_at: u64) -> Event {
    let secret = fixed_secret();
    let hashed = EventContent::new()
        .content(content)
        .kind(1)
        .stamp(created_at)
        .own(secret.public_key())
        .hash();
    // Sign the event id, then attach the signature to a copy of the hashed event.
    let signature = secret.sign(&hashed.id);
    hashed.clone().sign(signature)
}
/// Returns `true` when `a` and `b` differ by strictly less than `1e-9`.
///
/// Any comparison involving NaN yields `false`.
fn approx(a: f64, b: f64) -> bool {
    let gap = (a - b).abs();
    gap < 1e-9
}
// --- SearchQuery parsing ---
/// A space-separated phrase parses into one term per word, in order.
#[test]
fn parse_splits_on_whitespace() {
    let parsed = SearchQuery::parse("best nostr apps");
    let expected = vec!["best", "nostr", "apps"];
    assert_eq!(parsed.terms, expected);
}
/// `key:value` tokens are kept verbatim as ordinary terms.
#[test]
fn parse_treats_extensions_as_terms() {
    // NIP-50 extensions are not interpreted; each token is simply a term.
    let parsed = SearchQuery::parse("nostr domain:example.com language:en");
    let expected = vec!["nostr", "domain:example.com", "language:en"];
    assert_eq!(parsed.terms, expected);
}
/// Blank input and a freshly constructed query are empty; a query with a
/// term is not.
#[test]
fn parse_is_empty_for_blank_input() {
    for blank in [" ", ""] {
        assert!(SearchQuery::parse(blank).is_empty());
    }
    assert!(SearchQuery::new().is_empty());

    let non_blank = SearchQuery::parse("nostr");
    assert!(!non_blank.is_empty());
}
/// Rendering a parsed query with `to_string` reproduces the space-joined terms.
#[test]
fn display_joins_terms() {
    let rendered = SearchQuery::parse("nostr best apps").to_string();
    assert_eq!(rendered, "nostr best apps");
}
/// Parsing the rendered form of a query yields a query equal to the original.
#[test]
fn display_round_trips_through_parse() {
    let original = SearchQuery::parse("nostr best apps language:en");
    let rendered = original.to_string();
    let reparsed = SearchQuery::parse(&rendered);
    assert_eq!(reparsed, original);
}
// --- Scoring ---
/// Content containing every query term scores 1.0.
#[test]
fn score_full_match_is_one() {
    let query = SearchQuery::parse("nostr apps");
    let score = query.score("i love nostr and apps");
    assert!(approx(score, 1.0));
}
/// Content containing none of the query terms scores 0.0.
#[test]
fn score_no_match_is_zero() {
    let query = SearchQuery::parse("nostr");
    let score = query.score("no match here");
    assert!(approx(score, 0.0));
}
/// Matching one of two query terms, once, scores 0.5.
#[test]
fn score_partial_match_is_fraction() {
    let query = SearchQuery::parse("nostr apps");
    let score = query.score("only nostr here");
    assert!(approx(score, 0.5));
}
/// An upper-case query term matches mixed-case content.
#[test]
fn score_is_case_insensitive() {
    let query = SearchQuery::parse("NOSTR");
    let score = query.score("the Nostr protocol");
    assert!(approx(score, 1.0));
}
/// A `key:value` token is scored as literal text: it matches only when the
/// content contains that exact token.
#[test]
fn score_extension_like_term_matches_as_text() {
    let query = SearchQuery::parse("language:en");

    let with_marker = query.score("posted with language:en today");
    assert!(approx(with_marker, 1.0));

    let without_marker = query.score("no marker here");
    assert!(approx(without_marker, 0.0));
}
/// An empty query scores 1.0 against any content, including empty content.
#[test]
fn score_empty_query_is_one() {
    let parsed_empty = SearchQuery::parse("");
    assert!(approx(parsed_empty.score("anything at all"), 1.0));

    let fresh = SearchQuery::new();
    assert!(approx(fresh.score(""), 1.0));
}
/// Repeating a matched term raises a partial match above the single-hit
/// score while keeping it below 1.0.
#[test]
fn score_frequency_bonus_orders_partial_matches() {
    let q = SearchQuery::parse("alpha beta");
    // `once` and `many` are captured by name in the assertion message below.
    let (once, many) = (q.score("alpha only"), q.score("alpha alpha alpha"));

    assert!(approx(once, 0.5));
    assert!(many > once, "repeated term should score higher: {many} vs {once}");
    assert!(many < 1.0, "a partial match must stay below a full match");
}
/// Heavy repetition of a fully matched query still scores exactly 1.0.
#[test]
fn score_never_exceeds_one() {
    let query = SearchQuery::parse("nostr");
    // The single query term appears five times; the score stays at 1.0.
    let repeated = query.score("nostr nostr nostr nostr nostr");
    assert!(approx(repeated, 1.0));
}
/// Scores stay within `[0.0, 1.0]` for empty, partial, repeated, and full
/// matches.
#[test]
fn score_is_bounded() {
    let q = SearchQuery::parse("alpha beta gamma");
    let samples = ["", "alpha", "alpha beta", "alpha alpha gamma gamma", "alpha beta gamma"];
    let unit_interval = 0.0..=1.0;

    for content in samples {
        // `s` and `content` are captured by name in the assertion message.
        let s = q.score(content);
        assert!(unit_interval.contains(&s), "score {s} out of range for {content:?}");
    }
}
// --- Filter integration ---
/// `add_search` stores the query string and `clear_search` resets it to `None`.
#[test]
fn add_and_clear_search() {
    let with_search = Filter::new().add_search("nostr");
    assert_eq!(with_search.search.as_deref(), Some("nostr"));

    let cleared = with_search.clear_search();
    assert_eq!(cleared.search, None);
}
/// A filter with no search set scores an event at 1.0.
#[test]
fn search_score_without_search_is_one() {
    let event = make_event("anything", 1);
    let filter = Filter::new();
    assert!(approx(filter.search_score(&event), 1.0));
}
/// A filter's search scores 1.0 when its term appears in the event content
/// and 0.0 when it does not.
#[test]
fn search_score_with_search() {
    let event = make_event("the nostr protocol", 1);

    let hit = Filter::new().add_search("nostr");
    assert!(approx(hit.search_score(&event), 1.0));

    let miss = Filter::new().add_search("missing");
    assert!(approx(miss.search_score(&event), 0.0));
}
/// `matches` accepts an event whose content does not satisfy the filter's
/// search, while `search_score` reports 0.0 for the same pair.
#[test]
fn matches_ignores_search() {
    // Structural matching and search scoring are independent of each other.
    let filter = Filter::new().add_kind(1).add_search("not-in-content");
    let event = make_event("hello", 1);

    let structurally_matches = filter.matches(&event);
    assert!(structurally_matches, "matches must not consider the search field");

    let score = filter.search_score(&event);
    assert!(approx(score, 0.0));
}
// --- Serialization ---
/// A filter with a search serializes it under a `"search"` key and
/// deserializes back to an equal filter.
#[test]
fn search_round_trips_through_json() {
    let original = Filter::new().add_kind(1).add_search("best nostr apps");

    let wire = serde_json::to_string(&original).unwrap();
    assert!(wire.contains("\"search\":\"best nostr apps\""));

    let decoded: Filter = serde_json::from_str(&wire).unwrap();
    assert_eq!(original, decoded);
}
/// A filter without a search serializes to JSON that never mentions "search".
#[test]
fn search_absent_when_none() {
    let without_search = Filter::new().add_kind(1);
    let json = serde_json::to_string(&without_search).unwrap();
    assert!(!json.contains("search"));
}
// --- Grouping and set algebra ---
/// Filters that differ only in their search string get different group
/// keys; filters with the same search share one.
#[test]
fn group_distinguishes_searches() {
    let alpha = Filter::new().add_kind(1).add_search("alpha");
    let beta = Filter::new().add_kind(1).add_search("beta");
    let alpha_again = Filter::new().add_kind(1).add_search("alpha");

    let alpha_group = alpha.group();
    assert_ne!(alpha_group, beta.group());
    assert_eq!(alpha_group, alpha_again.group());
}
/// Unioning two filters with different searches leaves two filters.
#[test]
fn union_keeps_different_searches_separate() {
    let alpha = Filter::new().add_kind(1).add_search("alpha");
    let beta = Filter::new().add_kind(1).add_search("beta");
    let filters = vec![alpha, beta];

    let merged = union_filters(&filters);
    assert_eq!(merged.len(), 2);
}
/// Two filters sharing a search but naming different authors union into a
/// single filter that keeps the search and contains both authors.
#[test]
fn union_merges_same_search() {
    let first_author = SecretKey::generate().public_key();
    let second_author = SecretKey::generate().public_key();
    let filters = vec![
        Filter::new().add_search("x").add_author(first_author),
        Filter::new().add_search("x").add_author(second_author),
    ];

    let merged = union_filters(&filters);
    assert_eq!(merged.len(), 1);

    let only = &merged[0];
    assert_eq!(only.search.as_deref(), Some("x"));

    let authors = only.authors.as_ref().unwrap();
    assert!(authors.contains(&first_author) && authors.contains(&second_author));
}
/// Intersecting groups whose filters carry different searches yields two
/// filters, one for each original search string.
#[test]
fn intersect_keeps_conflicting_searches_separate() {
    let groups = vec![
        vec![Filter::new().add_search("nostr")],
        vec![Filter::new().add_search("bitcoin")],
    ];

    let result = intersect_filters(&groups);
    assert_eq!(result.len(), 2);

    // Each original search string must survive on one of the result filters.
    assert!(result.iter().any(|f| f.search.as_deref() == Some("nostr")));
    assert!(result.iter().any(|f| f.search.as_deref() == Some("bitcoin")));
}
/// Intersecting an author-only filter with a search-only filter produces
/// one filter carrying both the author and the search.
#[test]
fn intersect_combines_when_one_side_has_search() {
    let author = SecretKey::generate().public_key();
    let author_only = vec![Filter::new().add_author(author)];
    let search_only = vec![Filter::new().add_search("nostr")];
    let groups = vec![author_only, search_only];

    let result = intersect_filters(&groups);
    assert_eq!(result.len(), 1);

    let combined = &result[0];
    assert_eq!(combined.search.as_deref(), Some("nostr"));
    assert!(combined.authors.as_ref().unwrap().contains(&author));
}
/// Intersecting groups whose filters share the same search produces one
/// filter that keeps the search and contains both kinds.
#[test]
fn intersect_combines_equal_searches() {
    let kind_one = vec![Filter::new().add_kind(1).add_search("x")];
    let kind_two = vec![Filter::new().add_kind(2).add_search("x")];
    let groups = vec![kind_one, kind_two];

    let result = intersect_filters(&groups);
    assert_eq!(result.len(), 1);

    let combined = &result[0];
    assert_eq!(combined.search.as_deref(), Some("x"));

    let kinds = combined.kinds.as_ref().unwrap();
    assert!(kinds.contains(&1) && kinds.contains(&2));
}