Add search chapter
This commit is contained in:
+54
-9
@@ -83,6 +83,10 @@ pub struct Filter {
|
|||||||
/// Maximum number of events a consumer should return. This is not a
|
/// Maximum number of events a consumer should return. This is not a
|
||||||
/// matching criterion — [`matches`](Filter::matches) ignores it.
|
/// matching criterion — [`matches`](Filter::matches) ignores it.
|
||||||
pub limit: Option<usize>,
|
pub limit: Option<usize>,
|
||||||
|
/// NIP-50 full-text search query. The string is opaque at this point:
|
||||||
|
/// its structure and local relevance scoring are the subject of the next
|
||||||
|
/// chapter. `None` means no search constraint.
|
||||||
|
pub search: Option<String>,
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -91,6 +95,10 @@ the struct. But it is a result-count constraint for consumers (relays,
|
|||||||
storage engines), not a predicate over individual events. The `matches`
|
storage engines), not a predicate over individual events. The `matches`
|
||||||
method ignores it entirely.
|
method ignores it entirely.
|
||||||
|
|
||||||
|
The `search` field comes from NIP-50. At this point it is just a string
|
||||||
|
carried alongside the other fields; the [Search](12-search.md) chapter
|
||||||
|
gives it meaning — a query model and a local relevance score.
|
||||||
|
|
||||||
## Matching
|
## Matching
|
||||||
|
|
||||||
Matching walks each present field and returns `false` as soon as one
|
Matching walks each present field and returns `false` as soon as one
|
||||||
@@ -190,6 +198,7 @@ impl Filter {
|
|||||||
since: None,
|
since: None,
|
||||||
until: None,
|
until: None,
|
||||||
limit: None,
|
limit: None,
|
||||||
|
search: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -446,6 +455,9 @@ impl Serialize for Filter {
|
|||||||
if self.limit.is_some() {
|
if self.limit.is_some() {
|
||||||
count += 1;
|
count += 1;
|
||||||
}
|
}
|
||||||
|
if self.search.is_some() {
|
||||||
|
count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
let mut map = serializer.serialize_map(Some(count))?;
|
let mut map = serializer.serialize_map(Some(count))?;
|
||||||
|
|
||||||
@@ -477,6 +489,9 @@ impl Serialize for Filter {
|
|||||||
if let Some(limit) = self.limit {
|
if let Some(limit) = self.limit {
|
||||||
map.serialize_entry("limit", &limit)?;
|
map.serialize_entry("limit", &limit)?;
|
||||||
}
|
}
|
||||||
|
if let Some(search) = &self.search {
|
||||||
|
map.serialize_entry("search", search)?;
|
||||||
|
}
|
||||||
|
|
||||||
map.end()
|
map.end()
|
||||||
}
|
}
|
||||||
@@ -511,6 +526,7 @@ impl<'de> Visitor<'de> for FilterVisitor {
|
|||||||
let mut since: Option<u64> = None;
|
let mut since: Option<u64> = None;
|
||||||
let mut until: Option<u64> = None;
|
let mut until: Option<u64> = None;
|
||||||
let mut limit: Option<usize> = None;
|
let mut limit: Option<usize> = None;
|
||||||
|
let mut search: Option<String> = None;
|
||||||
|
|
||||||
while let Some(key) = map.next_key::<String>()? {
|
while let Some(key) = map.next_key::<String>()? {
|
||||||
match key.as_str() {
|
match key.as_str() {
|
||||||
@@ -544,6 +560,7 @@ impl<'de> Visitor<'de> for FilterVisitor {
|
|||||||
"since" => since = Some(map.next_value()?),
|
"since" => since = Some(map.next_value()?),
|
||||||
"until" => until = Some(map.next_value()?),
|
"until" => until = Some(map.next_value()?),
|
||||||
"limit" => limit = Some(map.next_value()?),
|
"limit" => limit = Some(map.next_value()?),
|
||||||
|
"search" => search = Some(map.next_value()?),
|
||||||
other if other.starts_with('#') => {
|
other if other.starts_with('#') => {
|
||||||
let tag_name = other[1..].to_string();
|
let tag_name = other[1..].to_string();
|
||||||
let values: Vec<String> = map.next_value()?;
|
let values: Vec<String> = map.next_value()?;
|
||||||
@@ -563,6 +580,7 @@ impl<'de> Visitor<'de> for FilterVisitor {
|
|||||||
since,
|
since,
|
||||||
until,
|
until,
|
||||||
limit,
|
limit,
|
||||||
|
search,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -634,6 +652,7 @@ impl Filter {
|
|||||||
/// (structural shape)
|
/// (structural shape)
|
||||||
/// - The exact `since` and `until` values (different time windows
|
/// - The exact `since` and `until` values (different time windows
|
||||||
/// cannot be combined)
|
/// cannot be combined)
|
||||||
|
/// - The exact `search` query (different searches cannot be combined)
|
||||||
///
|
///
|
||||||
/// A filter with a `limit` always gets a unique group key, because
|
/// A filter with a `limit` always gets a unique group key, because
|
||||||
/// merging limited filters would change result-count semantics.
|
/// merging limited filters would change result-count semantics.
|
||||||
@@ -649,6 +668,7 @@ impl Filter {
|
|||||||
|
|
||||||
self.since.hash(&mut hasher);
|
self.since.hash(&mut hasher);
|
||||||
self.until.hash(&mut hasher);
|
self.until.hash(&mut hasher);
|
||||||
|
self.search.hash(&mut hasher);
|
||||||
|
|
||||||
if self.limit.is_some() {
|
if self.limit.is_some() {
|
||||||
// Each limited filter gets a unique group — merging two
|
// Each limited filter gets a unique group — merging two
|
||||||
@@ -669,7 +689,9 @@ and a filter on `#p` tags land in different groups — as they should,
|
|||||||
since merging them by union would change the semantics. Likewise, two
|
since merging them by union would change the semantics. Likewise, two
|
||||||
filters with different `since` or `until` values land in different
|
filters with different `since` or `until` values land in different
|
||||||
groups, because a union of their sets under one time window would either
|
groups, because a union of their sets under one time window would either
|
||||||
over-fetch or under-fetch relative to what was requested.
|
over-fetch or under-fetch relative to what was requested. The `search`
|
||||||
|
query is treated the same way: two filters with different searches can
|
||||||
|
never be merged, so each distinct search forms its own group.
|
||||||
|
|
||||||
## Union and intersection
|
## Union and intersection
|
||||||
|
|
||||||
@@ -741,8 +763,9 @@ the earliest `until`. Finally it passes the result through
|
|||||||
///
|
///
|
||||||
/// Set fields are unioned. Time windows are tightened: the latest
|
/// Set fields are unioned. Time windows are tightened: the latest
|
||||||
/// `since` and earliest `until` win. If both filters have a `limit`,
|
/// `since` and earliest `until` win. If both filters have a `limit`,
|
||||||
/// the larger one is kept. The result is simplified with
|
/// the larger one is kept. Two filters carrying *different* searches
|
||||||
/// [`union_filters`].
|
/// cannot be combined into one, so the pair is kept separate instead.
|
||||||
|
/// The result is simplified with [`union_filters`].
|
||||||
pub fn intersect_filters(groups: &[Vec<Filter>]) -> Vec<Filter> {
|
pub fn intersect_filters(groups: &[Vec<Filter>]) -> Vec<Filter> {
|
||||||
let Some(first) = groups.first() else {
|
let Some(first) = groups.first() else {
|
||||||
return vec![];
|
return vec![];
|
||||||
@@ -755,7 +778,15 @@ pub fn intersect_filters(groups: &[Vec<Filter>]) -> Vec<Filter> {
|
|||||||
|
|
||||||
for f1 in &result {
|
for f1 in &result {
|
||||||
for f2 in filters {
|
for f2 in filters {
|
||||||
combined.push(combine_pair(f1, f2));
|
match combine_pair(f1, f2) {
|
||||||
|
Some(f) => combined.push(f),
|
||||||
|
// Two different searches can't be combined into one
|
||||||
|
// filter; keep both so neither query is lost.
|
||||||
|
None => {
|
||||||
|
combined.push(f1.clone());
|
||||||
|
combined.push(f2.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -765,7 +796,16 @@ pub fn intersect_filters(groups: &[Vec<Filter>]) -> Vec<Filter> {
|
|||||||
union_filters(&result)
|
union_filters(&result)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn combine_pair(a: &Filter, b: &Filter) -> Filter {
|
fn combine_pair(a: &Filter, b: &Filter) -> Option<Filter> {
|
||||||
|
// Two different searches cannot be expressed as a single search, so
|
||||||
|
// there is no filter that satisfies both. Returning `None` tells the
|
||||||
|
// caller to keep the pair separate rather than fabricate one.
|
||||||
|
if let (Some(s1), Some(s2)) = (&a.search, &b.search) {
|
||||||
|
if s1 != s2 {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let mut f = Filter::new();
|
let mut f = Filter::new();
|
||||||
|
|
||||||
f.ids = union_option_sets(&a.ids, &b.ids);
|
f.ids = union_option_sets(&a.ids, &b.ids);
|
||||||
@@ -794,7 +834,10 @@ fn combine_pair(a: &Filter, b: &Filter) -> Filter {
|
|||||||
(l, None) | (None, l) => l,
|
(l, None) | (None, l) => l,
|
||||||
};
|
};
|
||||||
|
|
||||||
f
|
// At most one search is present here (equal searches collapse to one).
|
||||||
|
f.search = a.search.clone().or_else(|| b.search.clone());
|
||||||
|
|
||||||
|
Some(f)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn union_option_sets<T: Ord + Clone>(
|
fn union_option_sets<T: Ord + Clone>(
|
||||||
@@ -815,6 +858,8 @@ fn union_option_sets<T: Ord + Clone>(
|
|||||||
|
|
||||||
## What's next
|
## What's next
|
||||||
|
|
||||||
The next chapter extends filters with NIP-50 full-text search — an
|
The `search` field rides along through serialization, grouping, and the
|
||||||
optional `search` field that some relays support for content-based
|
set algebra here, but it has no meaning yet — `matches` doesn't look at
|
||||||
queries.
|
it, and the string is uninterpreted. The next chapter takes up NIP-50
|
||||||
|
full-text search: a typed query model that splits the string into
|
||||||
|
terms, and a local relevance score that callers can use for ranking.
|
||||||
|
|||||||
@@ -0,0 +1,214 @@
|
|||||||
|
# Search
|
||||||
|
|
||||||
|
NIP-50 adds one field to the filter from the previous chapter: a `search`
|
||||||
|
string. A relay that advertises the capability reads the string as a
|
||||||
|
human-readable query — `best nostr apps` — matches it against event content,
|
||||||
|
and returns results ordered by relevance rather than by `created_at`, with
|
||||||
|
`limit` applied after ranking.
|
||||||
|
|
||||||
|
Search is opt-in and implementation-defined. Relays decide whether they index events
|
||||||
|
at all, what matches, and how ranking works. The query may also carry
|
||||||
|
`key:value` extensions — `domain:`, `language:`, `sentiment:`, `nsfw:`,
|
||||||
|
`include:spam` — and a relay honors only the ones it understands, ignoring the
|
||||||
|
rest. There is no global index and no guarantee of completeness: a client
|
||||||
|
queries the relays it believes support search and accepts a partial view.
|
||||||
|
|
||||||
|
Search may be implemented relay-side, or it may be performed on a client in some
|
||||||
|
situations. This chapter provides utilities for parsing search terms along with
|
||||||
|
a very basic model for implementing search that is decoupled from filter matching
|
||||||
|
itself and entirely opt-in.
|
||||||
|
|
||||||
|
## The module
|
||||||
|
|
||||||
|
```rust {file=coracle-lib/src/lib.rs}
|
||||||
|
pub mod search;
|
||||||
|
```
|
||||||
|
|
||||||
|
```rust {file=coracle-lib/src/search.rs}
|
||||||
|
//! NIP-50 full-text search queries.
|
||||||
|
//!
|
||||||
|
//! A [`SearchQuery`] holds the terms of a search string and computes a
|
||||||
|
//! best-effort relevance score against event content — for the case where
|
||||||
|
//! search runs on the client, over events already in hand, rather than on a
|
||||||
|
//! relay.
|
||||||
|
|
||||||
|
use std::fmt;
|
||||||
|
```
|
||||||
|
|
||||||
|
## The query model
|
||||||
|
|
||||||
|
A `SearchQuery` is just the query's terms: the words split out of the search
|
||||||
|
string. NIP-50 also defines `key:value` extensions, but their meaning is
|
||||||
|
relay-defined, and the local scorer has no way to evaluate `sentiment:negative`
|
||||||
|
or `domain:example.com` without data it doesn't have. Rather than model
|
||||||
|
extensions we can't honor, we treat every token as a term. A relay that
|
||||||
|
understands an extension still sees it verbatim in the query string; the local
|
||||||
|
scorer simply matches it as text like any other word.
|
||||||
|
|
||||||
|
```rust {file=coracle-lib/src/search.rs}
|
||||||
|
/// A parsed NIP-50 search query: the terms of the query string.
|
||||||
|
///
|
||||||
|
/// NIP-50 `key:value` extensions are not modeled separately — their semantics
|
||||||
|
/// are relay-defined and cannot be evaluated locally, so each is kept as an
|
||||||
|
/// ordinary term.
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, Default)]
|
||||||
|
pub struct SearchQuery {
|
||||||
|
/// The query's terms, in order.
|
||||||
|
pub terms: Vec<String>,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Parsing
|
||||||
|
|
||||||
|
Parsing splits the query on whitespace. Every token becomes a term, including
|
||||||
|
anything that looks like an extension. There is nothing to reject, so parsing is
|
||||||
|
total — it never errors.
|
||||||
|
|
||||||
|
```rust {file=coracle-lib/src/search.rs}
|
||||||
|
impl SearchQuery {
|
||||||
|
/// Create an empty query.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
SearchQuery::default()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse a raw query string by splitting it on whitespace. Every token,
|
||||||
|
/// extension-like or not, becomes a term. Parsing never fails.
|
||||||
|
pub fn parse(input: &str) -> Self {
|
||||||
|
SearchQuery {
|
||||||
|
terms: input.split_whitespace().map(str::to_string).collect(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// True when the query has no terms.
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.terms.is_empty()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Rendering joins the terms back into a query string. It is the inverse of
|
||||||
|
parsing: rendering a parsed query and parsing the result gives back an equal query, and
|
||||||
|
a raw string survives the round trip except that leading, trailing, and repeated whitespace is normalized to single spaces.
|
||||||
|
|
||||||
|
```rust {file=coracle-lib/src/search.rs}
|
||||||
|
impl fmt::Display for SearchQuery {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
f.write_str(&self.terms.join(" "))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Scoring
|
||||||
|
|
||||||
|
NIP-50 returns results in descending order of relevance, so a boolean "matches
|
||||||
|
or not" is the wrong shape for a local implementation. The scorer instead
|
||||||
|
returns a number in `0.0..=1.0`, which can drive both inclusion (anything above
|
||||||
|
zero is a hit) and ordering.
|
||||||
|
|
||||||
|
The score has two parts. The base is the fraction of the query's terms that
|
||||||
|
appear in the content, compared case-insensitively — three terms, two present,
|
||||||
|
gives `2/3`. On top of that, repeated occurrences add a small, diminishing
|
||||||
|
bonus, so that among events matching the same set of terms the ones that mention
|
||||||
|
them more often rank higher. The bonus is bounded below `1/total`, which means
|
||||||
|
it can reorder events *within* a fraction but can never push a partial match up
|
||||||
|
to a full one: a missing term always costs more than any number of repetitions
|
||||||
|
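can recover. The total is capped at `1.0`, so every event that contains all the terms scores exactly `1.0` however often it repeats them; the bonus only orders partial matches. An empty query — no terms — scores `1.0`, since there is no text to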
|
||||||
|
constrain.
|
||||||
|
|
||||||
|
```rust {file=coracle-lib/src/search.rs}
|
||||||
|
impl SearchQuery {
|
||||||
|
/// Score `content` against this query's terms, in `0.0..=1.0`.
|
||||||
|
///
|
||||||
|
/// The base score is the fraction of the query's terms found in the content
|
||||||
|
/// (case-insensitive substring). Repeated occurrences add a diminishing
|
||||||
|
/// bonus, strictly less than one term's worth, so a partial match never
|
||||||
|
/// reaches `1.0`. An empty query scores `1.0`: there is no text to match.
|
||||||
|
pub fn score(&self, content: &str) -> f64 {
|
||||||
|
let total = self.terms.len();
|
||||||
|
if total == 0 {
|
||||||
|
return 1.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
let haystack = content.to_lowercase();
|
||||||
|
|
||||||
|
let mut matched = 0usize;
|
||||||
|
let mut extra = 0usize;
|
||||||
|
for term in &self.terms {
|
||||||
|
let needle = term.to_lowercase();
|
||||||
|
if needle.is_empty() {
|
||||||
|
// An empty term imposes no constraint; treat it as present.
|
||||||
|
matched += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let count = haystack.matches(needle.as_str()).count();
|
||||||
|
if count > 0 {
|
||||||
|
matched += 1;
|
||||||
|
extra += count - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let base = matched as f64 / total as f64;
|
||||||
|
let bonus = (1.0 - 1.0 / (1.0 + extra as f64)) / total as f64;
|
||||||
|
(base + bonus).min(1.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Lowercasing uses `to_lowercase`, which folds case across Unicode rather than
|
||||||
|
only ASCII. That allocates, but nostr content is multilingual, and correctness
|
||||||
|
on non-Latin text is worth more than avoiding a copy in a best-effort matcher.
|
||||||
|
|
||||||
|
## Connecting queries to filters
|
||||||
|
|
||||||
|
The previous chapter gave `Filter` a `search` field but no way to set it. The
|
||||||
|
setters follow the established `add_*` / `clear_*` vocabulary.
|
||||||
|
|
||||||
|
```rust {file=coracle-lib/src/filters.rs}
|
||||||
|
use crate::search::SearchQuery;
|
||||||
|
|
||||||
|
impl Filter {
|
||||||
|
/// Set the NIP-50 search query.
|
||||||
|
pub fn add_search(mut self, search: impl Into<String>) -> Self {
|
||||||
|
self.search = Some(search.into());
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Remove the search query, leaving no search constraint.
|
||||||
|
pub fn clear_search(mut self) -> Self {
|
||||||
|
self.search = None;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Scoring an event against a filter is then a matter of parsing the field and
|
||||||
|
delegating to `SearchQuery::score`. With no search set the method returns `1.0`,
|
||||||
|
so an unsearched filter never penalizes an event. This is purely the search
|
||||||
|
dimension — it is independent of the structural `matches` check from the
|
||||||
|
previous chapter, and the two are meant to be composed by the caller, not folded
|
||||||
|
together. A consumer that wants search-ranked results filters with `matches`,
|
||||||
|
scores with `search_score`, and sorts as it sees fit.
|
||||||
|
|
||||||
|
```rust {file=coracle-lib/src/filters.rs}
|
||||||
|
impl Filter {
|
||||||
|
/// Best-effort local relevance score for `event`, in `0.0..=1.0`.
|
||||||
|
///
|
||||||
|
/// Parses the `search` field and scores it against the event's content,
|
||||||
|
/// returning `1.0` when there is no search. This considers *only* the
|
||||||
|
/// `search` field; it is independent of [`matches`](Filter::matches).
|
||||||
|
pub fn search_score(&self, event: &Event) -> f64 {
|
||||||
|
match &self.search {
|
||||||
|
Some(query) => SearchQuery::parse(query).score(&event.content),
|
||||||
|
None => 1.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## What's next
|
||||||
|
|
||||||
|
Search depends on routing the query to a relay that actually supports it.
|
||||||
|
Discovering which relays advertise NIP-50, and choosing among them, is a
|
||||||
|
networking and relay-metadata concern — the subject of the Domain and Networking
|
||||||
|
sections, where relay selection is built on top of the filter types assembled
|
||||||
|
here.
|
||||||
@@ -0,0 +1,189 @@
|
|||||||
|
# Plan: Search
|
||||||
|
|
||||||
|
## Topic Summary
|
||||||
|
|
||||||
|
NIP-50 adds an optional full-text `search` field to the subscription filter from
|
||||||
|
chapter 11. A relay that supports the capability interprets the query against
|
||||||
|
event content (and, for some kinds, other fields), returning results ordered by
|
||||||
|
relevance rather than `created_at`, with `limit` applied after ranking. The
|
||||||
|
query may carry `key:value` extensions — `domain:`, `language:`, `sentiment:`,
|
||||||
|
`nsfw:`, `include:spam` — which relays may support or ignore.
|
||||||
|
|
||||||
|
This chapter extends `Filter` with a `search` field, threads it through
|
||||||
|
serialization / grouping / set algebra, introduces a typed `SearchQuery` that
|
||||||
|
splits free-text terms from `key:value` extensions, and implements a best-effort
|
||||||
|
local relevance **score in [0, 1]** used to both include and rank events —
|
||||||
|
mirroring the NIP's "descending order by quality of result, limit last."
|
||||||
|
|
||||||
|
## Chapter Outline
|
||||||
|
|
||||||
|
1. **Intro / framing** — Search as a relay-defined, optional capability; content
|
||||||
|
discovery is client-initiated routing, not a global index; results are
|
||||||
|
partial and ranked by the relay. The local matcher is an honest best-effort
|
||||||
|
fallback, not a reimplementation of relay search.
|
||||||
|
2. **The `search` field** — Add `search: Option<String>` to `Filter`; builder
|
||||||
|
methods `add_search` / `clear_search`; note it joins the derived `Hash` (so
|
||||||
|
`id()` covers it for free).
|
||||||
|
3. **Serialization** — Emit/parse a plain `"search"` key in the hand-written
|
||||||
|
serde impl, present only when `Some`.
|
||||||
|
4. **The `SearchQuery` model** — A new `search` module: terms + ordered
|
||||||
|
`key:value` extensions, `parse`, `Display`, builders, and the `Filter` bridge.
|
||||||
|
5. **Scoring & matching** — `search_score` (fraction-of-terms + diminishing
|
||||||
|
frequency bonus, capped at 1.0); `matches` includes an event when score > 0;
|
||||||
|
`rank_search_results` sorts by score then `created_at` and applies `limit`.
|
||||||
|
6. **Grouping and set algebra** — `search` enters `group()` (distinct searches
|
||||||
|
never merge); `union_filters` carries it through unchanged; `intersect_filters`
|
||||||
|
keeps a conflicting-search pair separate instead of fabricating a combined query.
|
||||||
|
7. **What's next** — Brief pointer to the Domain section (relay selection,
|
||||||
|
discovering NIP-50-capable relays via relay metadata, is a later concern).
|
||||||
|
|
||||||
|
## API Design
|
||||||
|
|
||||||
|
### `coracle-lib/src/filters.rs` (extends existing `Filter`)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
pub struct Filter {
|
||||||
|
// ... existing fields ...
|
||||||
|
/// NIP-50 full-text search query. Relay-interpreted; see `SearchQuery`.
|
||||||
|
pub search: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Filter {
|
||||||
|
pub fn add_search(self, search: impl Into<String>) -> Self; // sets Some
|
||||||
|
pub fn clear_search(self) -> Self; // sets None
|
||||||
|
|
||||||
|
/// Bridge to the typed model.
|
||||||
|
pub fn add_search_query(self, query: &SearchQuery) -> Self; // = add_search(query.to_string())
|
||||||
|
pub fn search_query(&self) -> Option<SearchQuery>; // parse the field back
|
||||||
|
|
||||||
|
/// Best-effort local relevance score in [0.0, 1.0].
|
||||||
|
/// Returns 1.0 when there is no search, or a search with no free-text
|
||||||
|
/// terms (only extensions, which are unenforceable locally).
|
||||||
|
pub fn search_score(&self, event: &Event) -> f64;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Filter `events` to those matching `filter`, sort by relevance
|
||||||
|
/// (search_score desc, then created_at desc), and apply `filter.limit`.
|
||||||
|
pub fn rank_search_results<'a>(filter: &Filter, events: &'a [Event]) -> Vec<&'a Event>;
|
||||||
|
```
|
||||||
|
|
||||||
|
`matches` gains a final check: `if self.search_score(event) == 0.0 { return false }`.
|
||||||
|
Because `search_score` returns 1.0 when there is no search (or no terms), this
|
||||||
|
only rejects when a search *with terms* matched none of them — i.e. "any term
|
||||||
|
present ⇒ included."
|
||||||
|
|
||||||
|
### `coracle-lib/src/search.rs` (new module)
|
||||||
|
|
||||||
|
```rust
|
||||||
|
/// A parsed NIP-50 search query: free-text terms plus `key:value` extensions.
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, Default)]
|
||||||
|
pub struct SearchQuery {
|
||||||
|
pub terms: Vec<String>,
|
||||||
|
pub extensions: Vec<(String, String)>, // ordered; repeats allowed
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SearchQuery {
|
||||||
|
pub fn new() -> Self;
|
||||||
|
/// Total parse: split on whitespace; a token is an extension iff it is
|
||||||
|
/// `key:value` with key in [A-Za-z0-9_-]+, non-empty value not starting
|
||||||
|
/// with '/'. Everything else is a term. Never fails.
|
||||||
|
pub fn parse(input: &str) -> Self;
|
||||||
|
pub fn add_term(self, term: impl Into<String>) -> Self;
|
||||||
|
pub fn add_extension(self, key: impl Into<String>, value: impl Into<String>) -> Self;
|
||||||
|
pub fn is_empty(&self) -> bool;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for SearchQuery { /* terms first, then "key:value" exts, space-joined */ }
|
||||||
|
```
|
||||||
|
|
||||||
|
`Filter::matches` / `search_score` tokenize via `SearchQuery::parse`, using only
|
||||||
|
`terms` (extensions are ignored by the local matcher).
|
||||||
|
|
||||||
|
### Scoring formula (`search_score`)
|
||||||
|
|
||||||
|
For the parsed query's distinct `terms` (case-insensitive), against
|
||||||
|
`event.content` lowercased:
|
||||||
|
|
||||||
|
- `total` = number of distinct terms; if 0 → return 1.0.
|
||||||
|
- For each term, `count` = non-overlapping occurrences in content.
|
||||||
|
- `matched` = terms with `count ≥ 1`; `extra` = (Σ count) − matched (repeats
|
||||||
|
beyond the first hit of each matched term).
|
||||||
|
- `base = matched / total` (fraction of terms present, in [0, 1]).
|
||||||
|
- `bonus = (1 − 1/(1 + extra)) / total` (diminishing, strictly `< 1/total`, so a
|
||||||
|
partial match never reaches the next term's bucket).
|
||||||
|
- `score = (base + bonus).min(1.0)`.
|
||||||
|
|
||||||
|
Properties (asserted in tests): in [0, 1]; all terms once ⇒ 1.0; missing a term
|
||||||
|
⇒ `< 1.0`; more occurrences ⇒ ≥ score (monotonic, never exceeds 1.0); no terms
|
||||||
|
matched ⇒ exactly 0.0.
|
||||||
|
|
||||||
|
## Code Organization
|
||||||
|
|
||||||
|
- **`coracle-lib/src/filters.rs`** — add the `search` field, builders, the
|
||||||
|
serde changes, `search_score`, the `matches` check, `rank_search_results`,
|
||||||
|
and the `group()` / `intersect_filters` updates. `use crate::search::SearchQuery;`.
|
||||||
|
- **`coracle-lib/src/search.rs`** — the `SearchQuery` type. New `pub mod search;`
|
||||||
|
in `lib.rs`, placed before `filters` (filters depends on it).
|
||||||
|
- **`coracle-lib/src/prelude.rs`** — add `pub use crate::search::SearchQuery;`
|
||||||
|
(the prelude already re-exports commonly used items).
|
||||||
|
- **`coracle-lib/tests/search.rs`** — hand-written integration tests (not tangled).
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
None new. Parsing and matching use `std` only. No FTS engine — out of scope and
|
||||||
|
against the minimal-dependency rule.
|
||||||
|
|
||||||
|
## Narrative Notes
|
||||||
|
|
||||||
|
- Open with the philosophy: search is opt-in and relay-defined; no global index;
|
||||||
|
results partial and relay-ranked. Frame the local scorer as a fallback for
|
||||||
|
in-memory/offline querying, and warn (per rust-nostr's SDK) that re-filtering a
|
||||||
|
relay's returned results client-side can wrongly drop legitimate hits — relays
|
||||||
|
rank with richer, extension-aware logic.
|
||||||
|
- Explain *why* extensions are parsed but **ignored locally**: `sentiment:`,
|
||||||
|
`domain:`, etc. require data the client doesn't have, so honoring them locally
|
||||||
|
is impossible; we keep them in the typed model for *building/inspecting*
|
||||||
|
queries, not for local evaluation.
|
||||||
|
- Justify the score model concretely: NIP-50 mandates relevance ordering, so a
|
||||||
|
boolean match is the wrong shape — a [0,1] score lets us both include
|
||||||
|
(score > 0) and rank. Walk through the fraction + diminishing-bonus formula
|
||||||
|
with a small worked example.
|
||||||
|
- For grouping: reuse the chapter-11 reasoning — two filters with different
|
||||||
|
searches can't be unioned without changing semantics, so `search` joins the
|
||||||
|
group key. Show that `union_filters` then keeps them separate automatically.
|
||||||
|
- For `intersect_filters`: explain the one structural change — `combine_pair`
|
||||||
|
returns `Option<Filter>`; a pair whose two searches differ returns `None`, and
|
||||||
|
the caller emits both filters separately rather than concatenating queries.
|
||||||
|
|
||||||
|
## Design Decisions
|
||||||
|
|
||||||
|
1. **Typed `SearchQuery`, lean/generic.** Terms + a generic ordered list of
|
||||||
|
`key:value` extensions, with `add_term`/`add_extension`. No per-extension
|
||||||
|
helpers or typed enums — keeps the surface small and forward-compatible with
|
||||||
|
relay-specific extensions. (Every reference treats search as opaque; the typed
|
||||||
|
model is our value-add.)
|
||||||
|
2. **Local relevance score in [0, 1]**, fraction-of-terms + diminishing frequency
|
||||||
|
bonus, capped at 1.0. Chosen over a boolean to model NIP-50's relevance
|
||||||
|
ordering. Extensions excluded from scoring.
|
||||||
|
3. **`matches` includes on score > 0** ("any term present"); ranking via
|
||||||
|
`rank_search_results` handles relevance + `limit`-after-sort.
|
||||||
|
4. **`search` participates in `group()`**, so `union_filters` never merges
|
||||||
|
distinct searches.
|
||||||
|
5. **`intersect_filters` keeps a conflicting-search pair separate** (combine
|
||||||
|
returns `Option`, `None` ⇒ emit both) rather than concatenating, per the
|
||||||
|
user's choice.
|
||||||
|
6. **Builder naming `add_search`/`clear_search`** to match the existing
|
||||||
|
`add_since`/`clear_since` vocabulary (not rust-nostr's `search`/`remove_search`).
|
||||||
|
7. **Unicode-aware lowercasing** (`to_lowercase`) for the local matcher rather
|
||||||
|
than ASCII-only, given multilingual nostr content; note the allocation
|
||||||
|
trade-off. Substring counting via `str::matches`.
|
||||||
|
8. **Extension parse heuristic** documented: a colon-bearing token like a URL may
|
||||||
|
be read as an extension; applications needing exact control build
|
||||||
|
`SearchQuery` field-by-field instead of parsing.
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- Exact wording of the frequency-bonus explanation — keep the formula in prose
|
||||||
|
light; lean on a worked example. (Resolved during writing.)
|
||||||
|
- Whether `rank_search_results` belongs as a free function (consistent with
|
||||||
|
`matches_any`/`union_filters`) — yes, free function.
|
||||||
@@ -0,0 +1,307 @@
|
|||||||
|
# Research: Search
|
||||||
|
|
||||||
|
## Topic Summary
|
||||||
|
|
||||||
|
NIP-50 adds an optional full-text `search` field to the subscription filter
|
||||||
|
introduced in chapter 11. A relay that supports the capability interprets the
|
||||||
|
query string against event content (and, for some kinds, other fields),
|
||||||
|
returning results ordered by relevance rather than `created_at`. The query may
|
||||||
|
carry structured extensions in the form of `key:value` pairs — `domain:`,
|
||||||
|
`language:`, `sentiment:`, `nsfw:`, `include:spam` — which relays may support or
|
||||||
|
ignore.
|
||||||
|
|
||||||
|
The chapter will:
|
||||||
|
|
||||||
|
1. Add a `search` field to the existing `Filter` type, wiring it through
|
||||||
|
construction, serialization, hashing, grouping, and the union/intersect
|
||||||
|
utilities.
|
||||||
|
2. Introduce a typed `SearchQuery` model that splits free-text terms from
|
||||||
|
`key:value` extensions, so applications can build and inspect queries safely
|
||||||
|
instead of stringly-typed concatenation. (This is a deliberate departure
|
||||||
|
from every reference, which treats the query as an opaque string.)
|
||||||
|
3. Implement a best-effort, case-insensitive local matcher over event content,
|
||||||
|
while documenting that real ranking and extension semantics are
|
||||||
|
relay-defined.
|
||||||
|
|
||||||
|
The code lives in `coracle-lib`: the `search` field extends `filters.rs`, and
|
||||||
|
the query model gets a dedicated `search.rs` module.
|
||||||
|
|
||||||
|
## Philosophy
|
||||||
|
|
||||||
|
From `ref/building-nostr`, the framing relevant to search is that **content
|
||||||
|
discovery on nostr is client-initiated routing through relay selection**, not a
|
||||||
|
query against a global index. Searching is "knowing where to send queries." A
|
||||||
|
relay that supports NIP-50 is exercising an *optional, relay-authored
|
||||||
|
capability* — like content curation or access control — and defines its own
|
||||||
|
matching semantics, including which extensions it honors. This mirrors the NIP's
|
||||||
|
own "relays SHOULD ignore extensions they don't support."
|
||||||
|
|
||||||
|
Three principles bear directly on the chapter's voice:
|
||||||
|
|
||||||
|
- **No guaranteed completeness.** "No implementation will have a complete view
|
||||||
|
of every heuristic that is applicable" — so search results are neither global
|
||||||
|
nor exhaustive. A client queries the relays it knows support search and
|
||||||
|
accepts a partial, spontaneous view. This should be stated honestly, not hidden.
|
||||||
|
- **Indexing is the curator's responsibility, not the user's.** Authors publish
|
||||||
|
signed events; relays (or indexing services) that *want* content discoverable
|
||||||
|
maintain the index. Clients do nothing special beyond sending a `search`
|
||||||
|
filter to a search-capable relay.
|
||||||
|
- **Publicity, not privacy.** Full-text indexing makes content patterns
|
||||||
|
discoverable and gives relay operators visibility into queries. The honest
|
||||||
|
framing: search is a publicity feature.
|
||||||
|
|
||||||
|
The takeaway for our library: model `search` as a first-class but optional
|
||||||
|
filter field, keep the query structured enough that applications can reason
|
||||||
|
about it, and be candid that local matching is a best-effort approximation of a
|
||||||
|
relay-defined operation.
|
||||||
|
|
||||||
|
## Reference Implementation Analysis
|
||||||
|
|
||||||
|
### applesauce
|
||||||
|
|
||||||
|
`search` is an optional string on an extended `Filter` type
|
||||||
|
(`packages/core/src/helpers/filter.ts`): `Filter = CoreFilter & { search?: string }`,
|
||||||
|
extending nostr-tools' base type. **Opaque** — no extension parsing.
|
||||||
|
|
||||||
|
Dual-mode: relay subscriptions pass the string through verbatim; a local SQLite
|
||||||
|
backend (`packages/sqlite`) indexes content into an FTS5 table and runs
|
||||||
|
`events_search MATCH ?` with the raw string double-quote-escaped. Local
|
||||||
|
client-side `matchFilter()` **ignores** the search field entirely. Pluggable
|
||||||
|
"search content formatters" decide what gets indexed (default: `content`;
|
||||||
|
enhanced: kind-0 profile fields plus `t`/`subject`/`title`/`summary`/`d` tags).
|
||||||
|
Supports `order: "created_at" | "rank"` for FTS5 ranking. Low coupling; SQLite
|
||||||
|
is optional. No query-extension awareness anywhere.
|
||||||
|
|
||||||
|
### ndk
|
||||||
|
|
||||||
|
`search?: string` on `NDKFilter` (`core/src/subscription/index.ts:30`).
|
||||||
|
**Opaque, relay-only.** No parsing, no validation (filter-validation pipeline
|
||||||
|
skips it), no client-side matching (delegates to nostr-tools' `matchFilters`,
|
||||||
|
which ignores search). No helper functions for building search filters; callers
|
||||||
|
construct `{ search: "..." }` by hand. The field is serialized and sent to
|
||||||
|
relays as-is. No NIP-11 capability negotiation or fallback. Minimal by design.
|
||||||
|
|
||||||
|
### nostr-gadgets
|
||||||
|
|
||||||
|
Re-uses `@nostr/tools`' `Filter` type (`search?: string`). **Opaque,
|
||||||
|
relay-only.** Notably its local stores *reject* search: the in-memory store
|
||||||
|
returns an empty set if `filter.search` is present, and the RedEventStore docs
|
||||||
|
state "any filters supported (except 'search')." Provides a hardcoded
|
||||||
|
`SEARCH_RELAYS` constant (`defaults.ts`): `relay.nostr.band`, `nostr.wine`,
|
||||||
|
`relay.noswhere.com`, `relay.nos.today`. No query builders, no dynamic relay
|
||||||
|
capability detection.
|
||||||
|
|
||||||
|
### nostrlib (Go)
|
||||||
|
|
||||||
|
`Search string` on the `Filter` struct (`filter.go`), (de)serialized as a plain
|
||||||
|
`"search"` JSON key. The core `Filter.Matches` / `MatchesIgnoringTimestampConstraints`
|
||||||
|
**ignores** search — matching is delegated to eventstore backends. Key-value
|
||||||
|
backends (BoltDB, LMDB, MMM) return nothing for search queries; only the **Bleve**
|
||||||
|
backend implements real full-text search: per-document language auto-detection
|
||||||
|
(lingua-go, 22 languages), per-language analyzers, boolean query syntax
|
||||||
|
(`AND/OR/NOT`, parens, quoted phrases), NIP-27 reference extraction with 2× boost,
|
||||||
|
and case-insensitive substring validation of quoted phrases. Kind-0 profiles index
|
||||||
|
name/display_name/about; reposts unpack inner events. Khatru relay policies
|
||||||
|
`NoSearchQueries`/`RemoveSearchQueries` let operators disable search. SDK
|
||||||
|
`SearchUsers()` just sends a `Search` filter to designated user-search relays. No
|
||||||
|
NIP-50 *extension* parsing (treats `domain:x` as a regular word); a 2-char minimum
|
||||||
|
query length is enforced by Bleve.
|
||||||
|
|
||||||
|
### nostr-tools
|
||||||
|
|
||||||
|
`search?: string` on the base `Filter` (`filter.ts`). **The canonical
|
||||||
|
"defined-but-unused" implementation.** `matchFilter()`/`matchFilters()` do not
|
||||||
|
check search at all; `mergeFilters()` drops it entirely. No parsing, no
|
||||||
|
validation, no helpers, no tests for the field. Strictly a transport-layer
|
||||||
|
placeholder so applications can send search filters to relays. Minimal-deps
|
||||||
|
philosophy: search is purely a relay concern.
|
||||||
|
|
||||||
|
### rust-nostr
|
||||||
|
|
||||||
|
The most directly relevant reference (also Rust). In
|
||||||
|
`crates/nostr/src/filter.rs`:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
/// A string describing a query in a human-readable form, i.e. "best nostr apps"
|
||||||
|
/// <https://github.com/nostr-protocol/nips/blob/master/50.md>
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
#[serde(default)]
|
||||||
|
pub search: Option<String>,
|
||||||
|
```
|
||||||
|
|
||||||
|
Builder API: `search<S: Into<String>>(self, value: S) -> Self` and
|
||||||
|
`remove_search(self) -> Self` — symmetric, generic, `#[inline]`. **Opaque** (no
|
||||||
|
extension parsing).
|
||||||
|
|
||||||
|
Local matching (`search_match`):
|
||||||
|
|
||||||
|
```rust
|
||||||
|
fn search_match(&self, event: &Event) -> bool {
|
||||||
|
match &self.search {
|
||||||
|
Some(query) => event.content.as_bytes()
|
||||||
|
.windows(query.len())
|
||||||
|
.any(|window| window.eq_ignore_ascii_case(query.as_bytes())),
|
||||||
|
None => true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Case-insensitive **ASCII** substring via sliding window; `None` matches
|
||||||
|
everything. Gated by a `MatchEventOptions { nip50: bool, .. }` flag (default
|
||||||
|
true). Notably, the SDK relay sets `.nip50(false)` with the comment "Skip NIP-50
|
||||||
|
matches since they may create issues and ban non-malicious relays" — i.e.
|
||||||
|
client-side re-matching of a relay's search results can wrongly drop valid hits.
|
||||||
|
DB backends (LMDB, SQLite) extend matching to a fixed set of searchable tags —
|
||||||
|
`title`, `description`, `subject`, `name` — lowercasing the query once up front;
|
||||||
|
empty search → no results. A `Features { full_text_search: bool }` flag declares
|
||||||
|
backend capability.
|
||||||
|
|
||||||
|
Patterns worth emulating: `Into<String>` builder, `skip_serializing_if` for a
|
||||||
|
clean wire format, an explicit opt-out for search matching, ASCII case folding
|
||||||
|
for speed.
|
||||||
|
|
||||||
|
### welshman
|
||||||
|
|
||||||
|
The TypeScript toolkit our library descends from. `search?: string` on `Filter`
|
||||||
|
(`packages/util/src/Filters.ts`). It is the **only reference that matches search
|
||||||
|
locally and threads it through filter utilities**:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
export const matchFilter = (filter, event) => {
|
||||||
|
if (!nostrToolsMatchFilter(filter, event)) return false
|
||||||
|
if (filter.search) {
|
||||||
|
const content = event.content.toLowerCase()
|
||||||
|
const terms = filter.search.toLowerCase().split(/\s+/g)
|
||||||
|
for (const term of terms) {
|
||||||
|
if (content.includes(term)) return true
|
||||||
|
return false // <-- bug: returns after first term
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The intent is term-splitting + case-insensitive substring, but the early
|
||||||
|
`return false` means only the first term is ever checked. **A correct version
|
||||||
|
should decide AND vs OR across terms explicitly** — this is the one place we can
|
||||||
|
clearly improve on the reference.
|
||||||
|
|
||||||
|
Filter utilities (directly parallel to our `group`/`union_filters`/`intersect_filters`):
|
||||||
|
|
||||||
|
- `calculateFilterGroup` pushes `search:${search}` into the group key — **a
|
||||||
|
filter with a search is only mergeable with an identical search.**
|
||||||
|
- `unionFilters` treats `search` (like `since`/`until`/`limit`) as a scalar
|
||||||
|
preserved from the first filter in the group, **not merged**.
|
||||||
|
- `intersectFilters` concatenates differing searches with a space
|
||||||
|
(`[a, b].join(" ")`) — modeling "must match both" as a compound query — and
|
||||||
|
takes whichever is present otherwise.
|
||||||
|
- `getFilterId` includes search in the deterministic hash, so different searches
|
||||||
|
never dedupe.
|
||||||
|
|
||||||
|
Search-relay selection lives in the router: `getSearchRelays()` returns relays
|
||||||
|
whose NIP-11 `supported_nips` includes `"50"`. No extension parsing.
|
||||||
|
|
||||||
|
## Common Patterns
|
||||||
|
|
||||||
|
- **`search` is universally an optional plain string.** Every reference models
|
||||||
|
it as `Option<String>` / `search?: string`. None parse the `key:value`
|
||||||
|
extensions — they treat the whole query as opaque and let the relay interpret
|
||||||
|
it. Our typed `SearchQuery` is therefore a value-add, not a port.
|
||||||
|
- **Local matching is the exception, not the rule.** nostr-tools, ndk,
|
||||||
|
applesauce (in `matchFilter`), and nostrlib's core `Filter` all *ignore*
|
||||||
|
search locally; matching happens relay-side (or in a dedicated index like
|
||||||
|
Bleve/FTS5). Only rust-nostr and welshman attempt local matching, both with
|
||||||
|
case-insensitive substring over `content`.
|
||||||
|
- **Where matching exists, it's case-insensitive substring** — rust-nostr does
|
||||||
|
ASCII-only `eq_ignore_ascii_case` over byte windows (whole query as one
|
||||||
|
needle); welshman lowercases and splits on whitespace into terms (intending
|
||||||
|
multi-term, buggily). DB backends additionally search a small fixed set of
|
||||||
|
metadata tags (`title`, `description`, `subject`, `name`).
|
||||||
|
- **Search makes filters un-mergeable.** Both welshman (group key) and the
|
||||||
|
general intuition agree: two filters with different search strings can't be
|
||||||
|
unioned without changing semantics. rust-nostr sidesteps merging at this layer
|
||||||
|
entirely.
|
||||||
|
- **Client-side re-matching is risky.** rust-nostr's SDK disables NIP-50
|
||||||
|
matching when filtering relay results, because a relay's notion of a match
|
||||||
|
(ranked, fuzzy, multi-field, extension-aware) is richer than a client's
|
||||||
|
substring check — re-filtering can drop legitimate hits.
|
||||||
|
- **Relay selection by NIP-11.** Search-capable relays are discovered via
|
||||||
|
`supported_nips` containing `50` (welshman) or a hardcoded allowlist
|
||||||
|
(nostr-gadgets). This is an application/networking concern, out of scope for
|
||||||
|
`coracle-lib`.
|
||||||
|
|
||||||
|
## Considerations for Our Implementation
|
||||||
|
|
||||||
|
**Filter field.** Add `pub search: Option<String>` to `Filter`. Follow
|
||||||
|
rust-nostr: `add_search<S: Into<String>>(self, S)` and `clear_search(self)` to
|
||||||
|
match the existing `add_*`/`clear_*` builder vocabulary (our methods are named
|
||||||
|
`add_since`/`clear_since`, etc., so `add_search`/`clear_search` fits better than
|
||||||
|
rust-nostr's `search`/`remove_search`). Once added, the field participates in the
|
||||||
|
derived `Hash` (so `id()` covers it for free), but serialization, `group()`,
|
||||||
|
`union_filters`, `intersect_filters`, and `matches()` all need explicit updates.
|
||||||
|
|
||||||
|
**Serialization.** Our `Filter` has hand-written serde (to flatten `#tag` keys).
|
||||||
|
Add `search` as a plain `"search"` key — emit only when `Some` (mirroring
|
||||||
|
`since`/`until`/`limit`), and read it in the visitor's match arm. A round-trip
|
||||||
|
test must cover it.
|
||||||
|
|
||||||
|
**Grouping / union / intersect.** Per welshman: include `search` in the
|
||||||
|
`group()` hash so filters with different searches land in different groups (never
|
||||||
|
merged). In `union_filters`, since group members share an identical search by
|
||||||
|
construction, the search carries over via the `or_insert_with(|| filter.clone())`
|
||||||
|
seed — no special merge needed, but worth a comment. In `combine_pair`
|
||||||
|
(intersect), decide how to combine two searches: welshman concatenates with a
|
||||||
|
space. Concatenation is defensible ("must match both") but lossy and surprising;
|
||||||
|
a cleaner rule for a typed model is to **merge two `SearchQuery` values** (union
|
||||||
|
their terms and extensions) or, if we keep the field as a string at this layer,
|
||||||
|
to concatenate with a space and document it. Recommend: concatenate with a space
|
||||||
|
when both present and differ, matching welshman, and note the limitation.
|
||||||
|
|
||||||
|
**Local matching.** Extend `Filter::matches` to test `search` *after* the cheap
|
||||||
|
scalar checks. Best-effort, case-insensitive. Two design choices to settle in
|
||||||
|
planning:
|
||||||
|
1. Whole-query substring (rust-nostr) vs. term-split AND/OR (welshman, fixed).
|
||||||
|
A typed `SearchQuery` makes term-split natural: match the free-text terms
|
||||||
|
(AND across terms reads as the intuitive "all words present"; document it),
|
||||||
|
and treat `key:value` extensions as *unenforceable locally* — i.e. ignored by
|
||||||
|
the local matcher, since we can't evaluate `sentiment:` or `domain:` without
|
||||||
|
external data. This honesty matches the NIP.
|
||||||
|
2. ASCII (`eq_ignore_ascii_case`) vs. Unicode lowercasing. ASCII is what
|
||||||
|
rust-nostr ships and is allocation-free; Unicode `to_lowercase` is more
|
||||||
|
correct for non-Latin content but allocates. Given nostr's multilingual
|
||||||
|
content, prefer Unicode `to_lowercase` for the local matcher — correctness
|
||||||
|
over micro-optimization, consistent with our "clarity over cleverness" rule —
|
||||||
|
and note the trade-off.
|
||||||
|
|
||||||
|
Also document, per rust-nostr's SDK, that local matching is a *fallback*:
|
||||||
|
relay results should generally be trusted as-is rather than re-filtered.
|
||||||
|
|
||||||
|
**`SearchQuery` model (new `search.rs`).** A struct splitting a query into
|
||||||
|
free-text `terms: Vec<String>` and `extensions: Vec<(String, String)>` (ordered;
|
||||||
|
NIP-50 doesn't forbid repeats, and order can matter to relays). Parsing: split on
|
||||||
|
whitespace, treat a token containing `:` (with a non-empty key before it) as an
|
||||||
|
extension, everything else as a term. Provide:
|
||||||
|
- `SearchQuery::parse(&str) -> SearchQuery` (total, never fails — unknown shapes
|
||||||
|
fall back to terms).
|
||||||
|
- `Display` / `to_string()` that re-renders to the wire string (terms first or
|
||||||
|
preserve order; planning to decide).
|
||||||
|
- Builder helpers: `term`, `extension`, plus typed convenience for the
|
||||||
|
spec-defined extensions (`domain`, `language`, `sentiment`, `nsfw`,
|
||||||
|
`include_spam`) — optional, decide scope in planning.
|
||||||
|
- A bridge to `Filter`: `Filter::add_search` can accept `impl Into<String>` so
|
||||||
|
both a raw string and `query.to_string()` work; optionally
|
||||||
|
`Filter::search_query()` to parse the field back out.
|
||||||
|
|
||||||
|
Keep `sentiment`/`nsfw` values as strings (or small enums) — leaning toward
|
||||||
|
strings to stay forward-compatible with relay-specific values, with named
|
||||||
|
constructors for the common cases.
|
||||||
|
|
||||||
|
**Dependencies.** None new. Parsing is plain string handling; matching uses std.
|
||||||
|
Avoid pulling in a real FTS engine — out of scope and against the
|
||||||
|
minimal-dependency rule.
|
||||||
|
|
||||||
|
**Out of scope (defer / mention only).** Real relevance ranking; relay-side
|
||||||
|
indexing; NIP-11 search-relay discovery (a networking concern); the `order`
|
||||||
|
hint from applesauce; multi-field/tag matching beyond `content` (could mention
|
||||||
|
`title`/`subject` as a possible extension but keep the matcher content-only for
|
||||||
|
clarity).
|
||||||
@@ -0,0 +1,250 @@
|
|||||||
|
use coracle_lib::events::{Event, EventContent};
|
||||||
|
use coracle_lib::filters::{intersect_filters, union_filters, Filter};
|
||||||
|
use coracle_lib::keys::SecretKey;
|
||||||
|
use coracle_lib::search::SearchQuery;
|
||||||
|
|
||||||
|
fn fixed_secret() -> SecretKey {
|
||||||
|
let bytes: [u8; 32] = [
|
||||||
|
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
|
||||||
|
25, 26, 27, 28, 29, 30, 31, 32,
|
||||||
|
];
|
||||||
|
SecretKey::from_hex(&hex::encode(bytes)).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn make_event(content: &str, created_at: u64) -> Event {
|
||||||
|
let sk = fixed_secret();
|
||||||
|
let hashed = EventContent::new()
|
||||||
|
.content(content)
|
||||||
|
.kind(1)
|
||||||
|
.stamp(created_at)
|
||||||
|
.own(sk.public_key())
|
||||||
|
.hash();
|
||||||
|
hashed.clone().sign(sk.sign(&hashed.id))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` when `a` and `b` are equal or differ by less than `1e-9`.
///
/// The exact-equality check covers equal infinities: their difference is NaN,
/// which would otherwise fail the tolerance comparison. NaN never compares
/// approximately equal to anything, including itself.
fn approx(a: f64, b: f64) -> bool {
    const TOLERANCE: f64 = 1e-9;
    a == b || (a - b).abs() < TOLERANCE
}
|
||||||
|
|
||||||
|
// --- SearchQuery parsing ---
|
||||||
|
|
||||||
|
/// Parsing a space-separated query yields one term per word, in input order.
#[test]
fn parse_splits_on_whitespace() {
    let parsed = SearchQuery::parse("best nostr apps");
    let expected = vec!["best", "nostr", "apps"];
    assert_eq!(parsed.terms, expected);
}
|
||||||
|
|
||||||
|
/// `key:value` tokens are kept verbatim as ordinary terms.
#[test]
fn parse_treats_extensions_as_terms() {
    // NIP-50 extensions are not interpreted; each token is stored as a term.
    let input = "nostr domain:example.com language:en";
    let parsed = SearchQuery::parse(input);
    assert_eq!(
        parsed.terms,
        vec!["nostr", "domain:example.com", "language:en"]
    );
}
|
||||||
|
|
||||||
|
/// Blank input and a fresh query are empty; a query with a word is not.
#[test]
fn parse_is_empty_for_blank_input() {
    for blank in [" ", ""] {
        assert!(SearchQuery::parse(blank).is_empty());
    }
    assert!(SearchQuery::new().is_empty());

    let non_blank = SearchQuery::parse("nostr");
    assert!(!non_blank.is_empty());
}
|
||||||
|
|
||||||
|
/// Rendering a parsed query reproduces its terms separated by single spaces.
#[test]
fn display_joins_terms() {
    let rendered = SearchQuery::parse("nostr best apps").to_string();
    assert_eq!(rendered, "nostr best apps");
}
|
||||||
|
|
||||||
|
/// Parsing the rendered form of a query gives back an equal query.
#[test]
fn display_round_trips_through_parse() {
    let original = SearchQuery::parse("nostr best apps language:en");
    let rendered = original.to_string();
    let reparsed = SearchQuery::parse(&rendered);
    assert_eq!(reparsed, original);
}
|
||||||
|
|
||||||
|
// --- Scoring ---
|
||||||
|
|
||||||
|
/// Content containing every query term scores 1.0.
#[test]
fn score_full_match_is_one() {
    let score = SearchQuery::parse("nostr apps").score("i love nostr and apps");
    assert!(approx(score, 1.0));
}
|
||||||
|
|
||||||
|
/// Content containing none of the query terms scores 0.0.
#[test]
fn score_no_match_is_zero() {
    let score = SearchQuery::parse("nostr").score("no match here");
    assert!(approx(score, 0.0));
}
|
||||||
|
|
||||||
|
/// One of two terms appearing once in the content scores 0.5.
#[test]
fn score_partial_match_is_fraction() {
    let score = SearchQuery::parse("nostr apps").score("only nostr here");
    assert!(approx(score, 0.5));
}
|
||||||
|
|
||||||
|
/// An upper-case query term matches mixed-case content.
#[test]
fn score_is_case_insensitive() {
    let score = SearchQuery::parse("NOSTR").score("the Nostr protocol");
    assert!(approx(score, 1.0));
}
|
||||||
|
|
||||||
|
/// A `key:value` token is scored as literal text: present scores 1.0,
/// absent scores 0.0.
#[test]
fn score_extension_like_term_matches_as_text() {
    let query = SearchQuery::parse("language:en");
    let cases = [
        ("posted with language:en today", 1.0),
        ("no marker here", 0.0),
    ];
    for (content, expected) in cases {
        assert!(approx(query.score(content), expected));
    }
}
|
||||||
|
|
||||||
|
/// A query with no terms scores 1.0 against any content, including empty content.
#[test]
fn score_empty_query_is_one() {
    let from_blank = SearchQuery::parse("");
    assert!(approx(from_blank.score("anything at all"), 1.0));

    let fresh = SearchQuery::new();
    assert!(approx(fresh.score(""), 1.0));
}
|
||||||
|
|
||||||
|
/// Among partial matches, repeating the matched term raises the score, but
/// never up to the score of a full match.
#[test]
fn score_frequency_bonus_orders_partial_matches() {
    let query = SearchQuery::parse("alpha beta");
    let (once, many) = (query.score("alpha only"), query.score("alpha alpha alpha"));

    assert!(approx(once, 0.5));
    assert!(many > once, "repeated term should score higher: {many} vs {once}");
    assert!(many < 1.0, "a partial match must stay below a full match");
}
|
||||||
|
|
||||||
|
/// Repeating a fully matched term does not push the score past 1.0.
#[test]
fn score_never_exceeds_one() {
    // Five occurrences of the only query term: still capped at 1.0.
    let repeated = "nostr nostr nostr nostr nostr";
    let score = SearchQuery::parse("nostr").score(repeated);
    assert!(approx(score, 1.0));
}
|
||||||
|
|
||||||
|
/// Scores stay within `0.0..=1.0` for empty, partial, repeated, and full matches.
#[test]
fn score_is_bounded() {
    let query = SearchQuery::parse("alpha beta gamma");
    let unit_interval = 0.0..=1.0;
    let samples = [
        "",
        "alpha",
        "alpha beta",
        "alpha alpha gamma gamma",
        "alpha beta gamma",
    ];
    for content in samples {
        let s = query.score(content);
        assert!(unit_interval.contains(&s), "score {s} out of range for {content:?}");
    }
}
|
||||||
|
|
||||||
|
// --- Filter integration ---
|
||||||
|
|
||||||
|
/// `add_search` stores the query string and `clear_search` resets it to `None`.
#[test]
fn add_and_clear_search() {
    let with_search = Filter::new().add_search("nostr");
    assert_eq!(with_search.search.as_deref(), Some("nostr"));

    let cleared = with_search.clear_search();
    assert_eq!(cleared.search, None);
}
|
||||||
|
|
||||||
|
/// A filter with no search set scores an event at 1.0.
#[test]
fn search_score_without_search_is_one() {
    let score = Filter::new().search_score(&make_event("anything", 1));
    assert!(approx(score, 1.0));
}
|
||||||
|
|
||||||
|
/// A filter's search score is 1.0 when the term is in the event content and
/// 0.0 when it is not.
#[test]
fn search_score_with_search() {
    let event = make_event("the nostr protocol", 1);
    for (query, expected) in [("nostr", 1.0), ("missing", 0.0)] {
        let filter = Filter::new().add_search(query);
        assert!(approx(filter.search_score(&event), expected));
    }
}
|
||||||
|
|
||||||
|
/// `matches` accepts an event on its kind even when the search term is absent
/// from the content; only `search_score` reflects the miss.
#[test]
fn matches_ignores_search() {
    // Structural matching and search scoring are checked independently.
    let filter = Filter::new().add_kind(1).add_search("not-in-content");
    let event = make_event("hello", 1);

    let structurally_matches = filter.matches(&event);
    assert!(structurally_matches, "matches must not consider the search field");

    let score = filter.search_score(&event);
    assert!(approx(score, 0.0));
}
|
||||||
|
|
||||||
|
// --- Serialization ---
|
||||||
|
|
||||||
|
/// A filter with a search serializes it under the `"search"` key and
/// deserializes back to an equal filter.
#[test]
fn search_round_trips_through_json() {
    let original = Filter::new().add_kind(1).add_search("best nostr apps");

    let encoded = serde_json::to_string(&original).unwrap();
    assert!(encoded.contains("\"search\":\"best nostr apps\""));

    let decoded: Filter = serde_json::from_str(&encoded).unwrap();
    assert_eq!(original, decoded);
}
|
||||||
|
|
||||||
|
/// A filter without a search produces JSON that never mentions `search`.
#[test]
fn search_absent_when_none() {
    let filter = Filter::new().add_kind(1);
    let encoded = serde_json::to_string(&filter).unwrap();
    assert!(!encoded.contains("search"));
}
|
||||||
|
|
||||||
|
// --- Grouping and set algebra ---
|
||||||
|
|
||||||
|
/// Filters that differ only in their search land in different groups;
/// identical searches share a group.
#[test]
fn group_distinguishes_searches() {
    let kind_one_with = |query: &str| Filter::new().add_kind(1).add_search(query);

    let alpha = kind_one_with("alpha");
    let beta = kind_one_with("beta");
    let alpha_again = kind_one_with("alpha");

    assert_ne!(alpha.group(), beta.group());
    assert_eq!(alpha.group(), alpha_again.group());
}
|
||||||
|
|
||||||
|
/// Unioning two filters with different searches leaves two filters.
#[test]
fn union_keeps_different_searches_separate() {
    let filters: Vec<Filter> = ["alpha", "beta"]
        .iter()
        .map(|query| Filter::new().add_kind(1).add_search(*query))
        .collect();

    let merged = union_filters(&filters);
    assert_eq!(merged.len(), 2);
}
|
||||||
|
|
||||||
|
/// Two filters sharing a search collapse into one filter that keeps the
/// search and contains both authors.
#[test]
fn union_merges_same_search() {
    let first = SecretKey::generate().public_key();
    let second = SecretKey::generate().public_key();
    let filters = vec![
        Filter::new().add_search("x").add_author(first),
        Filter::new().add_search("x").add_author(second),
    ];

    let merged = union_filters(&filters);
    assert_eq!(merged.len(), 1);

    let only = &merged[0];
    assert_eq!(only.search.as_deref(), Some("x"));

    let authors = only.authors.as_ref().unwrap();
    let has_both = authors.contains(&first) && authors.contains(&second);
    assert!(has_both);
}
|
||||||
|
|
||||||
|
/// Intersecting groups whose searches differ yields two filters, one per
/// original search string.
#[test]
fn intersect_keeps_conflicting_searches_separate() {
    let groups: Vec<Vec<Filter>> = ["nostr", "bitcoin"]
        .iter()
        .map(|query| vec![Filter::new().add_search(*query)])
        .collect();

    let combined = intersect_filters(&groups);
    assert_eq!(combined.len(), 2);

    let seen: std::collections::BTreeSet<Option<String>> = combined
        .iter()
        .map(|filter| filter.search.clone())
        .collect();
    for expected in ["nostr", "bitcoin"] {
        assert!(seen.contains(&Some(expected.to_string())));
    }
}
|
||||||
|
|
||||||
|
/// When only one group has a search, the intersection is a single filter
/// carrying that search together with the other group's author.
#[test]
fn intersect_combines_when_one_side_has_search() {
    let author = SecretKey::generate().public_key();
    let author_only = vec![Filter::new().add_author(author)];
    let search_only = vec![Filter::new().add_search("nostr")];
    let groups = vec![author_only, search_only];

    let combined = intersect_filters(&groups);
    assert_eq!(combined.len(), 1);

    let only = &combined[0];
    assert_eq!(only.search.as_deref(), Some("nostr"));
    assert!(only.authors.as_ref().unwrap().contains(&author));
}
|
||||||
|
|
||||||
|
/// Groups with the same search intersect into one filter that keeps the
/// search and contains both kinds.
#[test]
fn intersect_combines_equal_searches() {
    let kind_one = vec![Filter::new().add_kind(1).add_search("x")];
    let kind_two = vec![Filter::new().add_kind(2).add_search("x")];
    let groups = vec![kind_one, kind_two];

    let combined = intersect_filters(&groups);
    assert_eq!(combined.len(), 1);

    let only = &combined[0];
    assert_eq!(only.search.as_deref(), Some("x"));

    let kinds = only.kinds.as_ref().unwrap();
    let has_both = kinds.contains(&1) && kinds.contains(&2);
    assert!(has_both);
}
|
||||||
Reference in New Issue
Block a user