1
Fork 0
mirror of https://github.com/Steffo99/patched-porobot.git synced 2025-01-08 17:49:46 +00:00

Refactor search::cardsearch module

This commit is contained in:
Steffo 2022-08-07 05:37:43 +02:00
parent d6cb0f8b56
commit 2da1264681
Signed by: steffo
GPG key ID: 6965406171929D01

View file

@ -1,281 +1,372 @@
//! This module configures [tantivy] structs for [Card] search. //! This module defines a [tantivy] search engine to find [Card]s.
use tantivy::{Document, Index, IndexReader, IndexWriter, TantivyError}; use tantivy::{Document, Index, IndexReader, IndexWriter};
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::query::{QueryParser, QueryParserError}; use tantivy::query::{QueryParser, QueryParserError};
use tantivy::schema::{Schema, TextOptions}; use tantivy::schema::{Field, NumericOptions, Schema, TextOptions};
use tantivy::tokenizer::TextAnalyzer; use tantivy::tokenizer::TextAnalyzer;
use itertools::Itertools; use itertools::Itertools;
use crate::data::corebundle::globals::LocalizedGlobalsIndexes; use crate::data::corebundle::globals::LocalizedGlobalsIndexes;
use crate::data::setbundle::r#type::CardType; use crate::data::setbundle::card::{Card, CardIndex};
use crate::data::setbundle::card::Card;
/// Create a new [tantivy::tokenizer::TextAnalyzer] for card text. /// The search engine.
/// ///
/// It should not alter text significantly, as it may contain important game vocabulary terms. /// To create a new engine, use [CardSearchEngine::new].
pub fn card_tokenizer() -> TextAnalyzer { ///
use tantivy::tokenizer::*; /// A separate search engine should be created for every locale.
pub struct CardSearchEngine {
/// The index of the search engine.
index: Index,
TextAnalyzer::from(SimpleTokenizer) /// Struct to read documents from the search engine.
.filter(LowerCaser) reader: IndexReader,
/// Struct to parse queries input by the user.
parser: QueryParser,
/// Localization of game globals used by the search engine.
pub globals: LocalizedGlobalsIndexes,
/// Cards searchable in the search engine.
pub cards: CardIndex
} }
/// Create a new [tantivy::schema::TextOptions] for card codes, skipping tokenization. impl CardSearchEngine {
pub fn cardcode_options() -> TextOptions { /// Create the [tantivy::tokenizer::TextAnalyzer] for card text.
use tantivy::schema::*; ///
/// It should not alter text significantly, as it may contain important game vocabulary terms.
fn tokenizer() -> TextAnalyzer {
use tantivy::tokenizer::*;
TextOptions::default() TextAnalyzer::from(SimpleTokenizer)
.set_stored() .filter(LowerCaser)
.set_fast() }
}
/// Create the [tantivy::schema::TextOptions] for card codes.
///
/// Card codes should:
/// - never be tokenized;
/// - be retrievable (what [tantivy] calls "stored").
fn options_code() -> TextOptions {
use tantivy::schema::*;
/// Create a new [tantivy::schema::TextOptions] for card keywords, using the given tokenizer. TextOptions::default()
pub fn cardkeyword_options() -> TextOptions { .set_stored()
use tantivy::schema::*; .set_fast()
}
TextOptions::default() /// Create the [tantivy::schema::TextOptions] for card keywords.
.set_indexing_options(TextFieldIndexing::default() ///
.set_tokenizer("card") /// Card keywords should:
.set_fieldnorms(false) /// - be tokenized with the [CardSearchEngine::tokenizer];
.set_index_option(IndexRecordOption::Basic) /// - ignore positioning.
fn options_keyword() -> TextOptions {
    use tantivy::schema::{IndexRecordOption, TextFieldIndexing};

    // Keyword fields go through the tokenizer registered under the name "card" and use the
    // `Basic` record option, since term positions are not needed for them.
    let indexing = TextFieldIndexing::default()
        .set_tokenizer("card")
        .set_index_option(IndexRecordOption::Basic);

    TextOptions::default().set_indexing_options(indexing)
}
/// Create the [tantivy::schema::TextOptions] shared by all free-text card fields.
///
/// Card text should:
/// - TODO: be tokenized with the tokenizer for the locale language
///   (for now the tokenizer registered as `"card"` is used);
/// - be indexed with both term frequencies and term positions.
fn options_text() -> TextOptions {
    use tantivy::schema::{IndexRecordOption, TextFieldIndexing};

    let indexing = TextFieldIndexing::default()
        .set_tokenizer("card")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);

    TextOptions::default().set_indexing_options(indexing)
}
/// Create the [tantivy::schema::NumericOptions] shared by all numeric card fields.
///
/// Starting from the defaults, the only option switched on is indexing.
fn options_number() -> NumericOptions {
    let defaults = NumericOptions::default();
    defaults.set_indexed()
}
/// Create the [Schema] for the search engine.
///
/// It will contain [Field]s with the following names, registered in this order:
///
/// | Name          | Type                             |
/// |---------------|----------------------------------|
/// | `code`        | [code](Self::options_code)       |
/// | `name`        | [text](Self::options_text)       |
/// | `type`        | [keyword](Self::options_keyword) |
/// | `set`         | [keyword](Self::options_keyword) |
/// | `rarity`      | [keyword](Self::options_keyword) |
/// | `collectible` | [number](Self::options_number)   |
/// | `regions`     | [keyword](Self::options_keyword) |
/// | `attack`      | [number](Self::options_number)   |
/// | `cost`        | [number](Self::options_number)   |
/// | `health`      | [number](Self::options_number)   |
/// | `spellspeed`  | [keyword](Self::options_keyword) |
/// | `keywords`    | [keyword](Self::options_keyword) |
/// | `description` | [text](Self::options_text)       |
/// | `levelup`     | [text](Self::options_text)       |
/// | `flavor`      | [text](Self::options_text)       |
/// | `artist`      | [text](Self::options_text)       |
/// | `subtypes`    | [keyword](Self::options_keyword) |
/// | `supertype`   | [keyword](Self::options_keyword) |
///
/// Numeric fields are all registered as `u64` fields.
///
/// Use [Self::schema_fields] to create the [CardSchemaFields] object containing all of them.
fn schema() -> Schema {
use tantivy::schema::*;
let mut schema_builder = Schema::builder();
// Build each kind of field options once; they are cloned for every field but the last user.
let options_code = Self::options_code();
let options_keyword = Self::options_keyword();
let options_text = Self::options_text();
let options_number = Self::options_number();
schema_builder.add_text_field("code", options_code);
schema_builder.add_text_field("name", options_text.clone());
schema_builder.add_text_field("type", options_keyword.clone());
schema_builder.add_text_field("set", options_keyword.clone());
schema_builder.add_text_field("rarity", options_keyword.clone());
schema_builder.add_u64_field("collectible", options_number.clone());
schema_builder.add_text_field("regions", options_keyword.clone());
schema_builder.add_u64_field("attack", options_number.clone());
schema_builder.add_u64_field("cost", options_number.clone());
schema_builder.add_u64_field("health", options_number);
schema_builder.add_text_field("spellspeed", options_keyword.clone());
schema_builder.add_text_field("keywords", options_keyword.clone());
schema_builder.add_text_field("description", options_text.clone());
schema_builder.add_text_field("levelup", options_text.clone());
schema_builder.add_text_field("flavor", options_text.clone());
schema_builder.add_text_field("artist", options_text);
schema_builder.add_text_field("subtypes", options_keyword.clone());
schema_builder.add_text_field("supertype", options_keyword);
schema_builder.build()
}
/// Look up every field the search engine uses in `schema` and bundle them in a [CardSchemaFields].
///
/// # Panics
///
/// If `schema` is missing any of the expected fields.
fn schema_fields(schema: &Schema) -> CardSchemaFields {
    // Resolves a field by name; `concat!` builds the panic message at compile time so that
    // it always names the field that was looked up.
    macro_rules! field {
        ($name:literal) => {
            schema
                .get_field($name)
                .expect(concat!("schema to have a '", $name, "' field"))
        };
    }

    CardSchemaFields {
        code: field!("code"),
        name: field!("name"),
        r#type: field!("type"),
        set: field!("set"),
        rarity: field!("rarity"),
        collectible: field!("collectible"),
        regions: field!("regions"),
        attack: field!("attack"),
        cost: field!("cost"),
        health: field!("health"),
        spellspeed: field!("spellspeed"),
        keywords: field!("keywords"),
        description: field!("description"),
        levelup: field!("levelup"),
        flavor: field!("flavor"),
        artist: field!("artist"),
        subtypes: field!("subtypes"),
        supertype: field!("supertype"),
    }
}
/// Build [in RAM](Index::create_in_ram) the [Index] of the search engine.
fn index() -> Index {
Index::create_in_ram(
Self::schema()
) )
} }
/// Build the [IndexWriter] used to fill the search engine's index.
///
/// The writer is given a budget of 12 MB of RAM; do not lower it below 3 MB, or it will panic!
///
/// # Panics
///
/// If the writer cannot be created.
fn writer(index: &Index) -> IndexWriter {
    // Memory budget handed to the writer, in bytes.
    const WRITER_BUDGET_BYTES: usize = 12_000_000;

    let writer = index.writer(WRITER_BUDGET_BYTES);
    writer.expect("to be able to create a IndexWriter")
}
/// Create a new [tantivy::schema::TextOptions] for card text fields, using the given tokenizer. /// Build a [IndexReader] with the optimal configuration for the search engine.
pub fn cardtext_options() -> TextOptions { fn reader(index: &Index) -> IndexReader {
use tantivy::schema::*; index
.reader_builder()
.reload_policy(tantivy::ReloadPolicy::Manual)
.try_into()
.expect("to be able to create a IndexReader")
}
TextOptions::default() /// Create a [Document] from a [Card].
.set_indexing_options(TextFieldIndexing::default() fn document(fields: &CardSchemaFields, globals: &LocalizedGlobalsIndexes, card: Card) -> Document {
.set_tokenizer("card") use tantivy::doc;
.set_fieldnorms(true)
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
)
}
doc!(
/// Create a new [tantivy::schema::Schema] using [Card]s as documents. fields.code => card.code,
pub fn card_schema() -> Schema { fields.name => card.name,
use tantivy::schema::*; fields.r#type => String::from(card.r#type),
fields.set => card.set
let mut schema_builder = Schema::builder(); .localized(&globals.sets)
.map(|cs| cs.name.to_owned())
let cardcode: TextOptions = cardcode_options(); .unwrap_or_else(String::new),
let cardkeyword: TextOptions = cardkeyword_options(); fields.rarity => card.rarity
let cardtext: TextOptions = cardtext_options(); .localized(&globals.rarities)
schema_builder.add_text_field("code", cardcode);
schema_builder.add_text_field("name", cardtext.clone());
schema_builder.add_text_field("type", cardkeyword.clone());
schema_builder.add_text_field("set", cardkeyword.clone());
schema_builder.add_text_field("rarity", cardkeyword.clone());
schema_builder.add_u64_field("collectible", INDEXED);
schema_builder.add_text_field("regions", cardkeyword.clone());
schema_builder.add_u64_field("attack", INDEXED);
schema_builder.add_u64_field("cost", INDEXED);
schema_builder.add_u64_field("health", INDEXED);
schema_builder.add_text_field("spellspeed", cardkeyword.clone());
schema_builder.add_text_field("keywords", cardkeyword.clone());
schema_builder.add_text_field("description", cardtext.clone());
schema_builder.add_text_field("levelup", cardtext.clone());
schema_builder.add_text_field("associated", cardtext.clone());
schema_builder.add_text_field("flavor", cardtext.clone());
schema_builder.add_text_field("artist", cardtext);
schema_builder.add_text_field("subtypes", cardkeyword.clone());
schema_builder.add_text_field("supertype", cardkeyword);
schema_builder.build()
}
/// Create a new [tantivy::Document] using a [Card] in a specific [locale](MappedGlobals] as base.
pub fn card_to_document(schema: &Schema, globals: &LocalizedGlobalsIndexes, card: Card) -> Document {
use tantivy::*;
let f_code = schema.get_field("code").expect("schema to have a 'code' field");
let f_name = schema.get_field("name").expect("schema to have a 'name' field");
let f_type = schema.get_field("type").expect("schema to have a 'type' field");
let f_set = schema.get_field("set").expect("schema to have a 'set' field");
let f_rarity = schema.get_field("rarity").expect("schema to have a 'rarity' field");
let f_collectible = schema.get_field("collectible").expect("schema to have a 'collectible' field");
let f_regions = schema.get_field("regions").expect("schema to have a 'regions' field");
let f_attack = schema.get_field("attack").expect("schema to have a 'attack' field");
let f_cost = schema.get_field("cost").expect("schema to have a 'cost' field");
let f_health = schema.get_field("health").expect("schema to have a 'health' field");
let f_spellspeed = schema.get_field("spellspeed").expect("schema to have a 'spellspeed' field");
let f_keywords = schema.get_field("keywords").expect("schema to have a 'keywords' field");
let f_description = schema.get_field("description").expect("schema to have a 'description' field");
let f_levelup = schema.get_field("levelup").expect("schema to have a 'levelup' field");
let f_associated = schema.get_field("associated").expect("schema to have a 'associated' field");
let f_flavor = schema.get_field("flavor").expect("schema to have a 'flavor' field");
let f_artist = schema.get_field("artist").expect("schema to have a 'artist' field");
let f_subtypes = schema.get_field("subtypes").expect("schema to have a 'subtypes' field");
let f_supertype = schema.get_field("supertype").expect("schema to have a 'supertype' field");
let c_type = match card.r#type {
CardType::Spell => "Spell",
CardType::Unit => "Unit",
CardType::Ability => "Ability",
CardType::Landmark => "Landmark",
CardType::Trap => "Trap",
CardType::Unsupported => "Unknown",
};
doc!(
f_code => card.code,
f_name => card.name,
f_type => c_type,
f_set => card.set
.localized(&globals.sets)
.map(|cs| cs.name.to_owned())
.unwrap_or_else(String::new),
f_rarity => card.rarity
.localized(&globals.rarities)
.map(|cr| cr.name.to_owned())
.unwrap_or_else(String::new),
f_collectible => if card.collectible {1u64} else {0u64},
f_regions => card.regions.iter()
.map(|region| region
.localized(&globals.regions)
.map(|cr| cr.name.to_owned()) .map(|cr| cr.name.to_owned())
.unwrap_or_else(String::new) .unwrap_or_else(String::new),
).join(" "), fields.collectible => if card.collectible {1u64} else {0u64},
f_attack => card.attack, fields.regions => card.regions.iter()
f_cost => card.cost, .map(|region| region
f_health => card.health, .localized(&globals.regions)
f_spellspeed => card.spell_speed .map(|cr| cr.name.to_owned())
.localized(&globals.spell_speeds) .unwrap_or_else(String::new)
.map(|ss| ss.name.to_owned()) ).join(" "),
.unwrap_or_else(String::new), fields.attack => card.attack,
f_keywords => card.keywords.iter() fields.cost => card.cost,
.map(|keyword| keyword fields.health => card.health,
.localized(&globals.keywords) fields.spellspeed => card.spell_speed
.map(|ck| ck.name.to_owned()) .localized(&globals.spell_speeds)
.unwrap_or_else(String::new)) .map(|ss| ss.name.to_owned())
.join(" "), .unwrap_or_else(String::new),
f_description => card.localized_description_text, fields.keywords => card.keywords.iter()
f_levelup => card.localized_levelup_text, .map(|keyword| keyword
f_associated => card.associated_card_codes.join(" "), .localized(&globals.keywords)
f_flavor => card.localized_flavor_text, .map(|ck| ck.name.to_owned())
f_artist => card.artist_name, .unwrap_or_else(String::new))
f_subtypes => card.subtypes.join(" "), .join(" "),
f_supertype => card.supertype, fields.description => card.localized_description_text,
) fields.levelup => card.localized_levelup_text,
fields.flavor => card.localized_flavor_text,
fields.artist => card.artist_name,
fields.subtypes => card.subtypes.join(" "),
fields.supertype => card.supertype,
)
}
/// Build the [QueryParser] of the search engine.
///
/// All fields in `fields` are converted to a `Vec<Field>` and passed to
/// [QueryParser::for_index] together with `index` (per the tantivy documentation, these are
/// the fields searched when a query term does not name one explicitly).
fn parser(index: &Index, fields: CardSchemaFields) -> QueryParser {
    // `index` is already a reference: pass it through instead of borrowing it again.
    QueryParser::for_index(index, Vec::from(fields))
}
/// Create a new [CardSearchEngine] from the localized globals and the cards to make searchable.
///
/// The index is created, the `"card"` tokenizer is registered on it, and one document per
/// card in `cards` is added and committed before the query parser and reader are built.
///
/// # Panics
///
/// If a document cannot be added, if the commit fails, or if any of the helper
/// constructors used here panics.
pub fn new(globals: LocalizedGlobalsIndexes, cards: CardIndex) -> Self {
    let index = Self::index();
    let fields = Self::schema_fields(&index.schema());

    // The tokenizer must be known to the index under the name the field options refer to.
    index.tokenizers().register("card", Self::tokenizer());

    let mut writer = Self::writer(&index);
    for card in cards.values().cloned() {
        writer
            .add_document(Self::document(&fields, &globals, card))
            .expect("IndexWriter threads to not panic or die before adding a document");
    }
    writer
        .commit()
        .expect("IndexWriter threads to not panic or die before commit");

    let parser = Self::parser(&index, fields);
    let reader = Self::reader(&index);

    Self {
        index,
        reader,
        parser,
        globals,
        cards,
    }
}
/// Perform a query on the search engine.
///
/// `input` is parsed with the engine's [QueryParser], at most `top` hits are collected, and
/// each hit is mapped back to a [Card] of [Self::cards] through the `code` stored in its
/// document.
///
/// Hits whose document cannot be loaded, that have no textual `code`, or whose code is not
/// present in [Self::cards] are silently skipped.
///
/// # Errors
///
/// Returns the [QueryParserError] if `input` cannot be parsed.
///
/// # Panics
///
/// If the search itself fails, or if the schema has no `code` field.
pub fn query(&self, input: &str, top: usize) -> Result<Vec<&Card>, QueryParserError> {
    let searcher = self.reader.searcher();
    let query = self.parser.parse_query(input)?;
    let hits = searcher.search(&*query, &TopDocs::with_limit(top))
        .expect("Searcher::search to never fail");
    let f_code = self.index.schema().get_field("code")
        .expect("schema to have a 'code' field");
    let results = hits.into_iter()
        .filter_map(|(_score, address)| searcher.doc(address).ok())
        // Borrow the code straight from the document: the returned card reference is tied to
        // `self.cards`, so neither the stored value nor the code has to be cloned.
        .filter_map(|doc| doc
            .get_first(f_code)
            .and_then(|value| value.as_text())
            .and_then(|code| self.cards.get(code))
        )
        .collect_vec();
    Ok(results)
}
} }
/// Stage all [tantivy::Document]s generated from [Card]s contained in the passed [Vec] for write on a [tantivy::Index] via the given [tantivy::IndexWriter]. /// Struct containing all retrieved [CardSearchEngine] [Field]s.
pub fn cards_to_index(writer: IndexWriter, schema: Schema, globals: &LocalizedGlobalsIndexes, cards: Vec<Card>) -> tantivy::Result<()> { ///
for card in cards { /// This makes it easier to pass them around without having to re-fetch them every time they are used.
writer.add_document(card_to_document(&schema, &globals, card))?; struct CardSchemaFields {
}; /// [Card::code].
Ok(()) pub code: Field,
/// [Card::name].
pub name: Field,
/// English [Card::type].
pub r#type: Field,
/// Localized [Card::set].
pub set: Field,
/// Localized [Card::rarity].
pub rarity: Field,
/// `0` if the card is not [Card::collectible], `1` otherwise.
pub collectible: Field,
/// Space-separated localized [Card::regions].
pub regions: Field,
/// [Card::attack].
pub attack: Field,
/// [Card::cost].
pub cost: Field,
/// [Card::health].
pub health: Field,
/// [Card::spell_speed].
pub spellspeed: Field,
/// Space-separated localized [Card::keywords].
pub keywords: Field,
/// [Card::localized_description_text].
pub description: Field,
/// [Card::localized_levelup_text].
pub levelup: Field,
/// [Card::localized_flavor_text].
pub flavor: Field,
/// [Card::artist_name].
pub artist: Field,
/// Space-separated [Card::subtypes].
pub subtypes: Field,
/// [Card::supertype].
pub supertype: Field,
} }
impl From<CardSchemaFields> for Vec<Field> {
/// Build a new [tantivy::Index] for [crate::schena::setbundle::Card] documents, based on [card_schema]. fn from(fields: CardSchemaFields) -> Self {
pub(crate) fn card_index() -> Index {
    // The index is created in RAM from the schema returned by `card_schema`.
    let schema = card_schema();
    Index::create_in_ram(schema)
}
/// Build a [tantivy::IndexWriter] for [Card] documents on the given index.
///
/// # Panics
///
/// If the writer cannot be created with its 4 MB budget.
pub(crate) fn card_writer(index: &Index) -> IndexWriter {
    let writer = index.writer(4_000_000);
    writer.expect("to be able to allocate 4 MB for a IndexWriter")
}
/// Build a [tantivy::IndexReader] for [Card] documents on the given index.
///
/// The reader uses the manual reload policy.
///
/// # Panics
///
/// If the reader cannot be created.
pub(crate) fn card_reader(index: &Index) -> IndexReader {
    let builder = index
        .reader_builder()
        .reload_policy(tantivy::ReloadPolicy::Manual);
    builder
        .try_into()
        .expect("to be able to create a IndexReader")
}
/// Build a new [tantivy::QueryParser] for [Card] documents, based on [crate::search::card::card_schema] and the passed index.
pub(crate) fn card_query_parser(index: &Index) -> QueryParser {
let schema = index.schema();
let f_code = schema.get_field("code").expect("schema to have a 'code' field");
let f_name = schema.get_field("name").expect("schema to have a 'name' field");
let f_type = schema.get_field("type").expect("schema to have a 'type' field");
let f_set = schema.get_field("set").expect("schema to have a 'set' field");
let f_rarity = schema.get_field("rarity").expect("schema to have a 'rarity' field");
let f_collectible = schema.get_field("collectible").expect("schema to have a 'collectible' field");
let f_regions = schema.get_field("regions").expect("schema to have a 'regions' field");
let f_attack = schema.get_field("attack").expect("schema to have a 'attack' field");
let f_cost = schema.get_field("cost").expect("schema to have a 'cost' field");
let f_health = schema.get_field("health").expect("schema to have a 'health' field");
let f_spellspeed = schema.get_field("spellspeed").expect("schema to have a 'spellspeed' field");
let f_keywords = schema.get_field("keywords").expect("schema to have a 'keywords' field");
let f_description = schema.get_field("description").expect("schema to have a 'description' field");
let f_levelup = schema.get_field("levelup").expect("schema to have a 'levelup' field");
let f_associated = schema.get_field("associated").expect("schema to have a 'associated' field");
let f_flavor = schema.get_field("flavor").expect("schema to have a 'flavor' field");
let f_artist = schema.get_field("artist").expect("schema to have a 'artist' field");
let f_subtypes = schema.get_field("subtypes").expect("schema to have a 'subtypes' field");
let f_supertype = schema.get_field("supertype").expect("schema to have a 'supertype' field");
QueryParser::for_index(
&index,
vec![ vec![
f_code, fields.code,
f_name, fields.name,
f_type, fields.r#type,
f_set, fields.set,
f_rarity, fields.rarity,
f_collectible, fields.collectible,
f_regions, fields.regions,
f_attack, fields.attack,
f_cost, fields.cost,
f_health, fields.health,
f_spellspeed, fields.spellspeed,
f_keywords, fields.keywords,
f_description, fields.description,
f_levelup, fields.levelup,
f_associated, fields.flavor,
f_flavor, fields.artist,
f_artist, fields.subtypes,
f_subtypes, fields.supertype,
f_supertype,
] ]
) }
}
/// Errors that can be returned by [card_query].
pub(crate) enum CardQueryError {
/// The query string could not be parsed by the [QueryParser].
Parsing(QueryParserError),
/// The search on the index failed.
Search(TantivyError),
}
pub(crate) fn card_query(schema: &Schema, reader: &IndexReader, parser: &QueryParser, query: &str, amount: usize) -> Result<Vec<String>, CardQueryError> {
log::debug!("Searching for `{}`...", &query);
let searcher = reader.searcher();
let query = parser.parse_query(query)
.map_err(CardQueryError::Parsing)?;
let search = searcher.search(&*query, &TopDocs::with_limit(amount))
.map_err(CardQueryError::Search)?;
let f_code = schema.get_field("code").expect("schema to have a 'code' field");
let results = search.iter()
.filter_map(|(_score, address)| searcher.doc(address.to_owned()).ok())
.filter_map(|doc| doc.get_first(f_code).cloned())
.filter_map(|field| field.as_text().map(String::from))
.collect_vec();
Ok(results)
} }