diff --git a/.gitmodules b/.gitmodules index e9ef195d6..a7c75913f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "optics/testcases"] - path = optics/testcases/samples +[submodule "crates/optics/testcases/samples"] + path = crates/optics/testcases/samples url = https://github.com/StractOrg/sample-optics diff --git a/.vscode/settings.json b/.vscode/settings.json index 30cd3a46a..4619901a8 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -23,6 +23,5 @@ "rust-analyzer.linkedProjects": [ "Cargo.toml", "optics-lsp/Cargo.toml", - "./core/Cargo.toml" ] } diff --git a/Cargo.lock b/Cargo.lock index 6ddbc6f56..ef2da2fe5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -92,6 +92,30 @@ dependencies = [ "memchr", ] +[[package]] +name = "alice" +version = "0.1.0" +dependencies = [ + "aes-gcm", + "anyhow", + "base64 0.21.4", + "bincode", + "flate2", + "half 2.3.1", + "memmap2", + "safetensors", + "serde", + "serde_json", + "stdx", + "stract-config", + "stract-llm", + "tch", + "thiserror", + "tokenizers", + "tracing", + "utoipa", +] + [[package]] name = "allocator-api2" version = "0.2.16" @@ -191,17 +215,6 @@ dependencies = [ "term", ] -[[package]] -name = "async-channel" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" -dependencies = [ - "concurrent-queue", - "event-listener", - "futures-core", -] - [[package]] name = "async-compression" version = "0.4.4" @@ -343,27 +356,6 @@ dependencies = [ "tower-service", ] -[[package]] -name = "axum-extra" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ab90e7b70bea63a153137162affb6a0bce26b584c24a4c7885509783e2cf30b" -dependencies = [ - "axum", - "axum-core", - "bytes", - "futures-util", - "http", - "http-body", - "mime", - "pin-project-lite", - "serde", - "tokio", - "tower", - "tower-layer", - "tower-service", -] - [[package]] name = "axum-macros" version = "0.3.8" @@ -788,6 +780,19 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd7cc57abe963c6d3b9d8be5b06ba7c8957a930305ca90304f24ef040aa6f961" +[[package]] +name = "collector" +version = "0.1.0" +dependencies = [ + "min-max-heap", + "schema", + "serde", + "simhash", + "stdx", + "stract-config", + "tantivy", +] + [[package]] name = "color_quant" version = "1.1.0" @@ -800,15 +805,6 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" -[[package]] -name = "concurrent-queue" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f057a694a54f12365049b0958a1685bb52d567f5593b355fbf685838e873d400" -dependencies = [ - "crossbeam-utils", -] - [[package]] name = "console" version = "0.15.7" @@ -859,6 +855,46 @@ dependencies = [ "libc", ] +[[package]] +name = "crawler" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "bincode", + "chrono", + "distributed", + "encoding_rs", + "futures", + "hashbrown 0.14.1", + "hyperloglog", + "kv", + "memmap2", + "mime", + "proptest", + "quick-xml 0.30.0", + "rand 0.8.5", + "rayon", + "reqwest", + "rkyv", + "robotstxt-with-cache", + "rust-s3", + "serde", + "serde_json", + "sonic", + "stdx", + "stract-config", + "thiserror", + "tokio", + "tokio-stream", + "tracing", + "url", + "uuid", + "warc", + "webgraph", + "webpage", +] + [[package]] name = "crc32fast" version = "1.3.2" @@ 
-1182,6 +1218,20 @@ dependencies = [ "winapi", ] +[[package]] +name = "distributed" +version = "0.1.0" +dependencies = [ + "anyhow", + "chitchat", + "serde", + "serde_json", + "tokio", + "tokio-stream", + "tracing", + "uuid", +] + [[package]] name = "dlv-list" version = "0.3.0" @@ -1239,6 +1289,30 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "entity_index" +version = "0.1.0" +dependencies = [ + "base64 0.21.4", + "bincode", + "bzip2", + "imager", + "insta", + "itertools 0.11.0", + "kv", + "md5", + "parse_wiki_text", + "quick-xml 0.30.0", + "serde", + "stdx", + "tantivy", + "thiserror", + "tokenizer", + "tracing", + "url", + "utoipa", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -1265,20 +1339,14 @@ dependencies = [ ] [[package]] -name = "event-listener" -version = "2.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" - -[[package]] -name = "eventsource-stream" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74fef4569247a5f429d9156b9d0a2599914385dd189c539334c625d8099d90ab" +name = "executor" +version = "0.1.0" dependencies = [ - "futures-core", - "nom", - "pin-project-lite", + "crossbeam-channel", + "num_cpus", + "rayon", + "thiserror", + "tracing", ] [[package]] @@ -1813,6 +1881,13 @@ dependencies = [ "tokio-native-tls", ] +[[package]] +name = "hyperloglog" +version = "0.1.0" +dependencies = [ + "serde", +] + [[package]] name = "iana-time-zone" version = "0.1.57" @@ -1881,6 +1956,24 @@ dependencies = [ "tiff", ] +[[package]] +name = "imager" +version = "0.1.0" +dependencies = [ + "bincode", + "distributed", + "futures", + "image", + "kv", + "reqwest", + "serde", + "stdx", + "thiserror", + "tokio", + "tracing", + "url", +] + [[package]] name = "indexmap" version = "1.9.3" @@ -2079,6 +2172,15 @@ dependencies = [ "thiserror", ] +[[package]] +name = "kv" +version = "0.1.0" +dependencies = [ + "bincode", + "rocksdb", + "serde", +] + [[package]] name = "lalrpop" version = "0.20.0" @@ -2269,12 +2371,6 @@ dependencies = [ "hashbrown 0.14.1", ] -[[package]] -name = "lz-str" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39f3d72d77227090eed75ea331285a53726e78374a1f357ff5757702c23c70cc" - [[package]] name = "lz4-sys" version = "1.9.4" @@ -2322,6 +2418,21 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" +[[package]] +name = "mapreduce" +version = "0.1.0" +dependencies = [ + "async-trait", + "distributed", + "futures", + "itertools 0.11.0", + "serde", + "sonic", + "thiserror", + "tokio", + "tracing", +] + [[package]] name = "markup5ever" version = "0.11.0" @@ -2400,16 +2511,6 @@ version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" -[[package]] -name = "memmap" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "memmap2" version = "0.9.0" @@ -2513,6 +2614,16 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9380db4c04d219ac5c51d14996bbf2c2e9a15229771b53f8671eb6c83cf44df" +[[package]] +name = "naive_bayes" +version = "0.1.0" +dependencies = [ + 
"hashbrown 0.14.1", + "itertools 0.11.0", + "serde", + "stdx", +] + [[package]] name = "nanorand" version = "0.7.0" @@ -2785,7 +2896,6 @@ dependencies = [ name = "optics" version = "0.1.0" dependencies = [ - "itertools 0.11.0", "lalrpop", "lalrpop-util", "logos", @@ -3830,6 +3940,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "schema" +version = "0.1.0" +dependencies = [ + "stdx", + "tantivy", + "tokenizer", +] + [[package]] name = "scoped-tls" version = "1.0.1" @@ -4099,6 +4218,14 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" +[[package]] +name = "simhash" +version = "0.1.0" +dependencies = [ + "tantivy", + "tokenizer", +] + [[package]] name = "similar" version = "2.3.0" @@ -4172,6 +4299,37 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "sonic" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "bincode", + "bytemuck", + "proptest", + "proptest-derive", + "serde", + "thiserror", + "tokio", + "tracing", +] + +[[package]] +name = "spell" +version = "0.1.0" +dependencies = [ + "fst", + "itertools 0.11.0", + "schema", + "serde", + "stdx", + "stract-query", + "tantivy", + "thiserror", + "tracing", +] + [[package]] name = "spin" version = "0.9.8" @@ -4206,16 +4364,49 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] -name = "stract" +name = "stdx" +version = "0.1.0" +dependencies = [ + "md5", + "serde", +] + +[[package]] +name = "stract-cli" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "serde", + "stract-config", + "stract-core", + "tikv-jemallocator", + "tokio", + "toml", + "tracing", + "tracing-subscriber", + "webgraph", +] + +[[package]] +name = "stract-config" +version = "0.1.0" +dependencies = [ + "distributed", + "rust-s3", + "serde", +] + +[[package]] +name = "stract-core" version = "0.1.0" dependencies = [ "aes-gcm", + "alice", "anyhow", - "async-channel", "async-stream", "async-trait", "axum", - "axum-extra", "axum-macros", "base64 0.21.4", "bincode", @@ -4225,13 +4416,16 @@ dependencies = [ "bzip2", "chitchat", "chrono", - "clap", + "collector", + "crawler", "criterion", "crossbeam-channel", "csv", "dashmap", + "distributed", "encoding_rs", - "eventsource-stream", + "entity_index", + "executor", "fend-core", "flate2", "fst", @@ -4239,23 +4433,25 @@ dependencies = [ "half 2.3.1", "hashbrown 0.14.1", "http", + "hyperloglog", "image", + "imager", "indicatif 0.17.7", - "insta", "itertools 0.11.0", "kuchiki", + "kv", "lalrpop", "lalrpop-util", "libc", "logos", - "lz-str", "lz4_flex", "maplit", + "mapreduce", "md5", - "memmap", "memmap2", "mime", "min-max-heap", + "naive_bayes", "num_cpus", "once_cell", "optics", @@ -4277,18 +4473,25 @@ dependencies = [ "rust-s3", "rust-stemmers", "safetensors", + "schema", "scylla", "serde", "serde_json", "serde_urlencoded", + "simhash", + "sonic", + "spell", + "stdx", + "stract-config", + "stract-llm", + "stract-query", "tantivy", "tch", "thiserror", - "tikv-jemallocator", + "tokenizer", "tokenizers", "tokio", "tokio-stream", - "toml", "tower-http", "tracing", "tracing-subscriber", @@ -4297,9 +4500,46 @@ dependencies = [ "utoipa", "utoipa-swagger-ui", "uuid", + "warc", + "webgraph", + "webpage", "whatlang", ] +[[package]] +name = "stract-llm" +version = "0.1.0" +dependencies = [ + "bincode", + "byteorder", + "flate2", + "itertools 0.11.0", + "serde", + "stdx", + 
"tch", + "thiserror", + "tokenizers", + "tracing", +] + +[[package]] +name = "stract-query" +version = "0.1.0" +dependencies = [ + "itertools 0.11.0", + "min-max-heap", + "optics", + "proptest", + "schema", + "serde", + "serde_json", + "stdx", + "tantivy", + "url", + "urlencoding", + "utoipa", +] + [[package]] name = "string_cache" version = "0.8.7" @@ -4738,6 +4978,20 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tokenizer" +version = "0.1.0" +dependencies = [ + "itertools 0.11.0", + "logos", + "serde", + "serde_json", + "stdx", + "tantivy", + "url", + "whatlang", +] + [[package]] name = "tokenizers" version = "0.13.4" @@ -5248,6 +5502,18 @@ dependencies = [ "try-lock", ] +[[package]] +name = "warc" +version = "0.1.0" +dependencies = [ + "chrono", + "encoding_rs", + "flate2", + "proptest", + "proptest-derive", + "thiserror", +] + [[package]] name = "wasi" version = "0.9.0+wasi-snapshot-preview1" @@ -5349,6 +5615,64 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webgraph" +version = "0.1.0" +dependencies = [ + "anyhow", + "bincode", + "bitvec", + "executor", + "hyperloglog", + "indicatif 0.17.7", + "itertools 0.11.0", + "kv", + "lz4_flex", + "maplit", + "md5", + "memmap2", + "rayon", + "rocksdb", + "serde", + "serde_json", + "stdx", + "tracing", + "url", + "utoipa", + "uuid", +] + +[[package]] +name = "webpage" +version = "0.1.0" +dependencies = [ + "anyhow", + "bincode", + "chrono", + "csv", + "itertools 0.11.0", + "kuchiki", + "maplit", + "naive_bayes", + "once_cell", + "publicsuffix", + "regex", + "rust-stemmers", + "schema", + "serde", + "serde_json", + "simhash", + "stdx", + "tantivy", + "thiserror", + "tokenizer", + "tracing", + "url", + "utoipa", + "webgraph", + "whatlang", +] + [[package]] name = "weezl" version = "0.1.7" diff --git a/Cargo.toml b/Cargo.toml index 7563f9fb6..007a22e42 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["core", "optics", "kuchiki"] +members = ["crates/*", "lib/*"] resolver = "2" [profile.release] @@ -11,6 +11,7 @@ debug = true [workspace.dependencies] aes-gcm = "0.10.2" +alice = { path = "./crates/alice" } anyhow = {version = "1.0.72", features = ["backtrace"]} async-channel = "1.8.0" async-stream = "0.3.3" @@ -27,12 +28,17 @@ bzip2 = "0.4.4" chitchat = "0.5.0" chrono = {version = "0.4.23", features = ["serde"]} clap = {version = "4.4.6", features = ["derive"]} +collector = { path = "./crates/collector" } +crawler = { path = "./crates/crawler" } criterion = "0.5.1" crossbeam-channel = "0.5.6" csv = "1.1.6" dashmap = {version = "5.4.0", features = ["rayon"]} +distributed = { path = "crates/distributed" } encoding_rs = "0.8.31" +entity_index = { path = "./crates/entity_index" } eventsource-stream = "0.2.3" +executor = { path = "./crates/executor" } fend-core = "1.2.2" flate2 = "1.0.28" fst = {version = "0.4.7", features = ["levenshtein"]} @@ -40,10 +46,14 @@ futures = "0.3.21" half = {version = "2.2.1", features = ["serde"]} hashbrown = {version = "0.14.0", features = ["serde", "rkyv"]} http = "0.2.8" +hyperloglog = { path = "./crates/hyperloglog" } image = "0.24.3" +imager = { path = "./crates/imager" } indicatif = {version = "0.17.7", features = ["rayon"]} insta = "1.31" itertools = "0.11.0" +kuchiki = { path = "./lib/kuchiki" } +kv = { path = "./crates/kv" } lalrpop = {version = "0.20.0", features = ["lexer"]} lalrpop-util = {version = "0.20.0", features = ["lexer"]} 
libc = "0.2.142" @@ -51,13 +61,16 @@ logos = "0.13.0" lz-str = "0.2.1" lz4_flex = "0.11.1" maplit = "1.0.2" +mapreduce = { path = "./crates/mapreduce" } md5 = "0.7.0" memmap = "0.7.0" memmap2 = "0.9.0" mime = "0.3.17" min-max-heap = "1.3.0" +naive_bayes = { path = "./crates/naive_bayes" } num_cpus = "1.15.0" once_cell = "1.13.1" +optics = { path = "./crates/optics" } parse_wiki_text = "0.1.5" proptest = "1.2.0" proptest-derive = "0.4.0" @@ -85,14 +98,25 @@ rusqlite = {version = "0.29.0", features = [ rust-s3 = {version = "0.33.0", features = ["blocking", "tokio"]} rust-stemmers = "1.2.0" safetensors = "0.3.1" +schema = { path = "./crates/schema" } scylla = { git = "https://github.com/scylladb/scylla-rust-driver", rev = "82c1c99f0ff86509f9dd1e649ecdaddc5a3660cf" } serde = {version = "1.0.137", features = ["rc", "derive"]} serde_json = "1.0.81" serde_urlencoded = "0.7.1" +simhash = { path = "./crates/simhash" } +sonic = { path = "./lib/sonic" } +spell = { path = "./crates/spell" } +stdx = { path = "./crates/stdx" } +stract-cli = { path = "./crates/cli" } +stract-core = { path = "./crates/core" } +stract-config = { path = "./crates/config" } +stract-llm = { path = "./crates/llm" } +stract-query = { path = "./crates/query" } tantivy = {git = "https://github.com/quickwit-oss/tantivy", rev = "182f58cea"} tch = "0.13.0" thiserror = "1.0.31" tikv-jemallocator = "0.5" +tokenizer = { path = "lib/tokenizer" } tokenizers = "0.13.2" tokio = {version = "1.23.1", features = ["full"]} tokio-stream = "0.1.11" @@ -105,7 +129,11 @@ urlencoding = "2.1.2" utoipa = {version = "4.0.0", features = ["axum_extras"]} utoipa-swagger-ui = {version = "4.0.0", features = ["axum"]} uuid = "1.1.2" +warc = { path = "./crates/warc" } +webgraph = { path = "./crates/webgraph" } +webpage = { path = "./crates/webpage" } whatlang = "0.16.0" + [profile.test.package] flate2.opt-level = 3 diff --git a/core/src/entrypoint/crawler.rs b/core/src/entrypoint/crawler.rs deleted file mode 100644 index fe3f96e83..000000000 --- a/core/src/entrypoint/crawler.rs +++ /dev/null @@ -1,134 +0,0 @@ -// Stract is an open source web search engine. -// Copyright (C) 2023 Stract ApS -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as -// published by the Free Software Foundation, either version 3 of the -// License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . 
- -use std::{net::SocketAddr, sync::Arc}; - -use serde::{Deserialize, Serialize}; - -use crate::{ - config, - crawler::{self, planner::make_crawl_plan, CrawlCoordinator, Crawler}, - distributed::sonic::{self, service::Message}, - kv::rocksdb_store::RocksDbStore, - sonic_service, - webgraph::WebgraphBuilder, - Result, -}; - -pub async fn worker(config: config::CrawlerConfig) -> Result<()> { - let crawler = Crawler::new(config).await?; - - crawler.run().await; - - Ok(()) -} - -pub async fn coordinator(config: config::CrawlCoordinatorConfig) -> Result<()> { - let coordinator = Arc::new(CrawlCoordinator::new(config.job_queue)?); - - let addr: SocketAddr = config.host; - let server = coordinator::CoordinatorService { coordinator } - .bind(addr) - .await - .unwrap(); - - tracing::info!("Crawl coordinator listening on {}", addr); - - loop { - let _ = server.accept().await; - } -} - -pub async fn router(config: config::CrawlRouterConfig) -> Result<()> { - let router = crawler::Router::new(config.coordinator_addrs.clone()).await?; - - let addr: SocketAddr = config.host; - - let server = router::RouterService { router }.bind(addr).await.unwrap(); - - tracing::info!("Crawl router listening on {}", addr); - - loop { - let _ = server.accept().await; - } -} - -pub fn planner(config: config::CrawlPlannerConfig) -> Result<()> { - let page_centrality = RocksDbStore::open(&config.page_harmonic_path); - let host_centrality = RocksDbStore::open(&config.host_harmonic_path); - let page_graph = WebgraphBuilder::new(&config.page_graph_path).open(); - let host_graph = WebgraphBuilder::new(&config.host_graph_path).open(); - let output_path = config.output_path.clone(); - - make_crawl_plan( - host_centrality, - page_centrality, - host_graph, - page_graph, - config, - output_path, - )?; - - Ok(()) -} - -pub mod router { - use crate::crawler::Job; - - use super::*; - pub struct RouterService { - pub router: crawler::Router, - } - - sonic_service!(RouterService, [NewJob]); - - #[derive(Debug, Clone, Serialize, Deserialize)] - pub struct NewJob {} - - #[async_trait::async_trait] - impl Message for NewJob { - type Response = Option; - - async fn handle(self, server: &RouterService) -> sonic::Result { - Ok(server.router.sample_job().await?) - } - } -} - -pub mod coordinator { - use crate::crawler::Job; - - use super::*; - - pub struct CoordinatorService { - pub coordinator: Arc, - } - - sonic_service!(CoordinatorService, [GetJob]); - - #[derive(Debug, Clone, Serialize, Deserialize)] - pub struct GetJob {} - - #[async_trait::async_trait] - impl Message for GetJob { - type Response = Option; - - async fn handle(self, server: &CoordinatorService) -> sonic::Result { - let job = server.coordinator.sample_job()?; - Ok(job) - } - } -} diff --git a/core/src/entrypoint/mod.rs b/core/src/entrypoint/mod.rs deleted file mode 100644 index 230da23a0..000000000 --- a/core/src/entrypoint/mod.rs +++ /dev/null @@ -1,64 +0,0 @@ -// Stract is an open source web search engine. -// Copyright (C) 2023 Stract ApS -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as -// published by the Free Software Foundation, either version 3 of the -// License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. 
-// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . - -//! The entrypoint module contains all entrypoints that runs the executables. -#[cfg(feature = "with_alice")] -pub mod alice; -pub mod api; -pub mod autosuggest_scrape; -mod centrality; -#[cfg(feature = "dev")] -pub mod configure; -pub mod crawler; -pub mod dmoz_parser; -mod entity; -pub mod indexer; -pub mod safety_classifier; -pub mod search_server; -mod webgraph; -pub mod webgraph_server; - -pub use centrality::Centrality; -pub use entity::EntityIndexer; -pub use indexer::Indexer; -use tracing::{debug, log::error}; -pub use webgraph::Webgraph; - -use crate::{config, warc::WarcFile}; - -fn download_all_warc_files<'a>( - warc_paths: &'a [String], - source: &'a config::WarcSource, -) -> impl Iterator + 'a { - let warc_paths: Vec<_> = warc_paths - .iter() - .map(|warc_path| warc_path.to_string()) - .collect(); - - warc_paths.into_iter().filter_map(|warc_path| { - debug!("downloading warc file {}", &warc_path); - let res = WarcFile::download(source, &warc_path); - - if let Err(err) = res { - error!("error while downloading: {:?}", err); - return None; - } - - debug!("finished downloading"); - - Some(res.unwrap()) - }) -} diff --git a/core/src/query/optic.rs b/core/src/query/optic.rs deleted file mode 100644 index 4433ebaea..000000000 --- a/core/src/query/optic.rs +++ /dev/null @@ -1,1805 +0,0 @@ -// Stract is an open source web search engine. -// Copyright (C) 2023 Stract ApS -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as -// published by the Free Software Foundation, either version 3 of the -// License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . 
- -use itertools::Itertools; -use optics::{Action, MatchLocation, Matching, Optic, Rule}; -use tantivy::{ - query::{BooleanQuery, Occur, QueryClone}, - schema::Schema, -}; - -use crate::{fastfield_reader::FastFieldReader, schema::TextField}; - -use super::{const_query::ConstQuery, pattern_query::PatternQuery, union::UnionQuery}; - -pub trait AsTantivyQuery { - fn as_tantivy( - &self, - schema: &Schema, - fastfield_reader: &FastFieldReader, - ) -> Box; -} - -pub trait AsMultipleTantivyQuery { - fn as_multiple_tantivy( - &self, - schema: &Schema, - fastfield_reader: &FastFieldReader, - ) -> Vec<(Occur, Box)>; -} - -impl AsMultipleTantivyQuery for Optic { - fn as_multiple_tantivy( - &self, - schema: &Schema, - fastfields: &FastFieldReader, - ) -> Vec<(Occur, Box)> { - if self.discard_non_matching { - vec![( - Occur::Must, - UnionQuery::from( - self.rules - .iter() - .chain(self.site_rankings.rules().iter()) - .filter_map(|rule| rule.as_searchable_rule(schema, fastfields)) - .map(|(occur, rule)| { - BooleanQuery::from(vec![(occur, rule.query)]).box_clone() - }) - .collect_vec(), - ) - .box_clone(), - )] - } else { - self.rules - .iter() - .chain(self.site_rankings.rules().iter()) - .filter_map(|rule| rule.as_searchable_rule(schema, fastfields)) - .map(|(occur, rule)| (occur, rule.query)) - .collect() - } - } -} - -pub struct SearchableRule { - pub query: Box, - pub boost: f64, -} - -pub trait AsSearchableRule { - fn as_searchable_rule( - &self, - schema: &Schema, - fastfield_reader: &FastFieldReader, - ) -> Option<(Occur, SearchableRule)>; -} - -impl AsSearchableRule for Rule { - fn as_searchable_rule( - &self, - schema: &Schema, - fastfield_reader: &FastFieldReader, - ) -> Option<(Occur, SearchableRule)> { - let mut subqueries: Vec<_> = self - .matches - .iter() - .map(|matching| (Occur::Must, matching.as_tantivy(schema, fastfield_reader))) - .collect(); - - if subqueries.is_empty() { - return None; - } - - let subquery = if subqueries.len() == 1 { - subqueries.pop().unwrap().1 - } else { - Box::new(BooleanQuery::from(subqueries)) - }; - - match &self.action { - Action::Boost(boost) => Some(( - Occur::Should, - SearchableRule { - query: Box::new(ConstQuery::new(subquery, 1.0)), - boost: *boost as f64, - }, - )), - Action::Downrank(boost) => Some(( - Occur::Should, - SearchableRule { - query: Box::new(ConstQuery::new(subquery, 1.0)), - boost: *boost as f64 * -1.0, - }, - )), - Action::Discard => Some(( - Occur::MustNot, - SearchableRule { - query: subquery, - boost: 0.0, - }, - )), - } - } -} - -impl AsTantivyQuery for Matching { - fn as_tantivy( - &self, - schema: &Schema, - fastfield_reader: &FastFieldReader, - ) -> Box { - match &self.location { - MatchLocation::Site => ConstQuery::new( - PatternQuery::new( - self.pattern.clone(), - TextField::UrlForSiteOperator, - schema, - fastfield_reader.clone(), - ) - .box_clone(), - 1.0, - ) - .box_clone(), - MatchLocation::Url => Box::new(ConstQuery::new( - Box::new(PatternQuery::new( - self.pattern.clone(), - TextField::Url, - schema, - fastfield_reader.clone(), - )), - 1.0, - )), - MatchLocation::Domain => Box::new(ConstQuery::new( - Box::new(PatternQuery::new( - self.pattern.clone(), - TextField::Domain, - schema, - fastfield_reader.clone(), - )), - 1.0, - )), - MatchLocation::Title => Box::new(ConstQuery::new( - Box::new(PatternQuery::new( - self.pattern.clone(), - TextField::Title, - schema, - fastfield_reader.clone(), - )), - 1.0, - )), - MatchLocation::Description => UnionQuery::from(vec![ - Box::new(ConstQuery::new( - 
Box::new(PatternQuery::new( - self.pattern.clone(), - TextField::Description, - schema, - fastfield_reader.clone(), - )), - 1.0, - )) as Box, - Box::new(ConstQuery::new( - Box::new(PatternQuery::new( - self.pattern.clone(), - TextField::DmozDescription, - schema, - fastfield_reader.clone(), - )), - 1.0, - )) as Box, - ]) - .box_clone(), - MatchLocation::Content => Box::new(ConstQuery::new( - Box::new(PatternQuery::new( - self.pattern.clone(), - TextField::CleanBody, - schema, - fastfield_reader.clone(), - )), - 1.0, - )), - MatchLocation::MicroformatTag => Box::new(ConstQuery::new( - Box::new(PatternQuery::new( - self.pattern.clone(), - TextField::MicroformatTags, - schema, - fastfield_reader.clone(), - )), - 1.0, - )), - MatchLocation::Schema => Box::new(ConstQuery::new( - Box::new(PatternQuery::new( - self.pattern.clone(), - TextField::FlattenedSchemaOrgJson, - schema, - fastfield_reader.clone(), - )), - 1.0, - )), - } - } -} - -#[cfg(test)] -mod tests { - use optics::{Optic, SiteRankings}; - - use crate::{ - gen_temp_path, - index::Index, - ranking::inbound_similarity::InboundSimilarity, - searcher::{LocalSearcher, SearchQuery}, - webgraph::{Node, WebgraphWriter}, - webpage::{Html, Webpage}, - }; - - const CONTENT: &str = "this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever"; - - #[test] - fn discard_and_boost_sites() { - let mut index = Index::temporary().expect("Unable to open index"); - - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - Website A - - - {CONTENT} {} - example example example - - - "#, - crate::rand_words(100) - ), - "https://www.a.com", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - Website B - - - {CONTENT} {} - - - "#, - crate::rand_words(100) - ), - "https://www.b.com", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.01, - page_centrality: 0.0, - - pre_computed_score: 0.0, - fetch_time_ms: 500, - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - - index.commit().expect("failed to commit index"); - let searcher = LocalSearcher::from(index); - - let res = searcher - .search(&SearchQuery { - query: "website".to_string(), - ..Default::default() - }) - .unwrap() - .webpages; - - assert_eq!(res.len(), 2); - assert_eq!(res[0].url, "https://www.b.com/"); - assert_eq!(res[1].url, "https://www.a.com/"); - - let res = searcher - .search(&SearchQuery { - query: "website".to_string(), - optic: Some( - Optic::parse( - r#" - Rule { - Matches { - Domain("b.com") - }, - Action(Discard) - } - "#, - ) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - - assert_eq!(res.len(), 1); - assert_eq!(res[0].url, "https://www.a.com/"); - - let res = searcher - .search(&SearchQuery { - query: "website".to_string(), - optic: Some( - Optic::parse( - r#" - Rule { - Matches { - Domain("a.com") - }, - Action(Boost(10)) - } - "#, - ) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - - assert_eq!(res.len(), 2); - assert_eq!(res[0].url, "https://www.a.com/"); - assert_eq!(res[1].url, 
"https://www.b.com/"); - } - - #[test] - fn example_optics_dont_crash() { - let mut index = Index::temporary().expect("Unable to open index"); - - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - Website A - - - {CONTENT} - example example example - - - "# - ), - "https://www.a.com/this/is/a/pattern", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - Website B - - - {CONTENT} - - - "# - ), - "https://www.b.com/this/is/b/pattern", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0001, - page_centrality: 0.0, - - pre_computed_score: 0.0, - fetch_time_ms: 500, - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - - index.commit().expect("failed to commit index"); - let searcher = LocalSearcher::from(index); - - let _ = searcher - .search(&SearchQuery { - query: "website".to_string(), - optic: Some( - Optic::parse(include_str!( - "../../../optics/testcases/samples/quickstart.optic" - )) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - - let _ = searcher - .search(&SearchQuery { - query: "website".to_string(), - optic: Some( - Optic::parse(include_str!( - "../../../optics/testcases/samples/hacker_news.optic" - )) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - - let _ = searcher - .search(&SearchQuery { - query: "website".to_string(), - optic: Some( - Optic::parse(include_str!( - "../../../optics/testcases/samples/copycats_removal.optic" - )) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - } - - #[test] - fn empty_discard() { - let mut index = Index::temporary().expect("Unable to open index"); - - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - Website A - - - {CONTENT} {} - example example example - - - "#, - crate::rand_words(100) - ), - "https://www.a.com/this/is/a/pattern", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - Website B - - - {CONTENT} {} - - - "#, - crate::rand_words(100) - ), - "https://www.b.com/this/is/b/pattern", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0001, - page_centrality: 0.0, - - pre_computed_score: 0.0, - fetch_time_ms: 500, - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - Website B - - - {CONTENT} {} - - - "#, - crate::rand_words(100) - ), - "https://www.c.com/this/is/c/pattern", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0001, - page_centrality: 0.0, - - pre_computed_score: 0.0, - fetch_time_ms: 500, - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - - index.commit().expect("failed to commit index"); - let searcher = LocalSearcher::from(index); - - let res = searcher - .search(&SearchQuery { - query: "website".to_string(), - optic: Some( - Optic::parse( - 
r#" - DiscardNonMatching; - Rule { - Matches { - Domain("a.com") - }, - Action(Boost(6)) - }; - Rule { - Matches { - Domain("b.com") - }, - Action(Boost(1)) - }; - "#, - ) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - - assert_eq!(res.len(), 2); - assert_eq!(res[0].url, "https://www.a.com/this/is/a/pattern"); - } - - #[test] - fn liked_sites() { - let mut index = Index::temporary().expect("Unable to open index"); - - let mut writer = WebgraphWriter::new( - gen_temp_path(), - crate::executor::Executor::single_thread(), - crate::webgraph::Compression::default(), - ); - - writer.insert( - Node::from("https://www.e.com").into_host(), - Node::from("https://www.a.com").into_host(), - String::new(), - ); - writer.insert( - Node::from("https://www.a.com").into_host(), - Node::from("https://www.e.com").into_host(), - String::new(), - ); - - writer.insert( - Node::from("https://www.c.com").into_host(), - Node::from("https://www.c.com").into_host(), - String::new(), - ); - - writer.insert( - Node::from("https://www.b.com").into_host(), - Node::from("https://www.e.com").into_host(), - String::new(), - ); - writer.insert( - Node::from("https://www.e.com").into_host(), - Node::from("https://www.b.com").into_host(), - String::new(), - ); - - let graph = writer.finalize(); - - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - Website A - - - {CONTENT} {} - example example example - - - "#, - crate::rand_words(100) - ), - "https://www.a.com/this/is/a/pattern", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - dmoz_description: None, - safety_classification: None, - node_id: Some(Node::from("www.a.com").into_host().id()), - }) - .expect("failed to insert webpage"); - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - Website B - - - {CONTENT} {} - - - "#, - crate::rand_words(100) - ), - "https://www.b.com/this/is/b/pattern", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0001, - page_centrality: 0.0, - - pre_computed_score: 0.0, - fetch_time_ms: 500, - dmoz_description: None, - safety_classification: None, - node_id: Some(Node::from("www.b.com").into_host().id()), - }) - .expect("failed to insert webpage"); - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - Website C - - - {CONTENT} {} - - - "#, - crate::rand_words(100) - ), - "https://www.c.com/this/is/c/pattern", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0002, - page_centrality: 0.0, - - pre_computed_score: 0.0, - fetch_time_ms: 500, - dmoz_description: None, - safety_classification: None, - node_id: Some(Node::from("www.c.com").into_host().id()), - }) - .expect("failed to insert webpage"); - - index.commit().expect("failed to commit index"); - let mut searcher = LocalSearcher::from(index); - - searcher.set_inbound_similarity(InboundSimilarity::build(&graph)); - - let res = searcher - .search(&SearchQuery { - query: "website".to_string(), - optic: Some( - Optic::parse( - r#" - Like(Site("www.a.com")); - Like(Site("www.b.com")); - Dislike(Site("www.c.com")); - "#, - ) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - - assert_eq!(res.len(), 3); - assert_eq!(res[0].url, "https://www.b.com/this/is/b/pattern"); - assert_eq!(res[1].url, "https://www.a.com/this/is/a/pattern"); - assert_eq!(res[2].url, "https://www.c.com/this/is/c/pattern"); - } - - #[test] - fn schema_org_search() { - let mut index = 
Index::temporary().expect("Unable to open index"); - - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - Website A - - - - {CONTENT} {} - example example example - - - "#, - crate::rand_words(100) - ), - "https://www.a.com/", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - index - .insert(Webpage { - html: Html::parse( - &format!( - r##" - - - Website B - - -
-
-

Comments

-
- -
-

Posted by: - Greg -

-

-
-

Ha!

-
-
-
- {CONTENT} {} - - - "##, - crate::rand_words(100) - ), - "https://www.b.com/", - ).unwrap(), - backlink_labels: vec![], - host_centrality: 0.0001, - page_centrality: 0.0, - - pre_computed_score: 0.0, - fetch_time_ms: 500, - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - - index.commit().unwrap(); - let searcher = LocalSearcher::from(index); - - let res = searcher - .search(&SearchQuery { - query: "website".to_string(), - optic: Some( - Optic::parse( - r#" - DiscardNonMatching; - Rule { - Matches { - Schema("BlogPosting") - } - } - "#, - ) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - - assert_eq!(res.len(), 1); - assert_eq!(res[0].url, "https://www.b.com/"); - - let res = searcher - .search(&SearchQuery { - query: "website".to_string(), - optic: Some( - Optic::parse( - r#" - DiscardNonMatching; - Rule { - Matches { - Schema("BlogPosting.comment") - } - } - "#, - ) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - - assert_eq!(res.len(), 1); - assert_eq!(res[0].url, "https://www.b.com/"); - - let res = searcher - .search(&SearchQuery { - query: "website".to_string(), - optic: Some( - Optic::parse( - r#" - DiscardNonMatching; - Rule { - Matches { - Schema("ImageObject") - } - } - "#, - ) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - - assert_eq!(res.len(), 1); - assert_eq!(res[0].url, "https://www.a.com/"); - - let res = searcher - .search(&SearchQuery { - query: "website".to_string(), - optic: Some( - Optic::parse( - r#" - DiscardNonMatching; - Rule { - Matches { - Schema("Person") - } - } - "#, - ) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - - assert_eq!(res.len(), 1); - assert_eq!(res[0].url, "https://www.b.com/"); - } - - #[test] - fn pattern_same_phrase() { - let mut index = Index::temporary().expect("Unable to open index"); - - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - Website A - - - {CONTENT} {} - example example example - - - "#, - crate::rand_words(100) - ), - "https://chat.stackoverflow.com", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - - index.commit().expect("failed to commit index"); - let searcher = LocalSearcher::from(index); - - let res = searcher - .search(&SearchQuery { - query: "site:stackoverflow.com".to_string(), - optic: Some( - Optic::parse( - r#" - DiscardNonMatching; - Rule { - Matches { - Site("a.com") - }, - Action(Boost(6)) - }; - Rule { - Matches { - Site("stackoverflow.blog") - }, - Action(Boost(1)) - }; - Rule { - Matches { - Site("chat.b.eu") - }, - Action(Boost(1)) - }; - "#, - ) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - - assert_eq!(res.len(), 0); - } - - #[test] - fn discard_all_discard_like() { - let mut index = Index::temporary().expect("Unable to open index"); - - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - Website A - - - {CONTENT} {} - example example example - - - "#, - crate::rand_words(100) - ), - "https://a.com", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - 
index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - Website B - - - {CONTENT} {} - example example example - - - "#, - crate::rand_words(100) - ), - "https://b.com/", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - - index.commit().expect("failed to commit index"); - let searcher = LocalSearcher::from(index); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse( - r#" - DiscardNonMatching; - Rule { - Matches { - Site("b.com") - } - }; - "#, - ) - .unwrap(), - ), - site_rankings: Some(SiteRankings { - liked: vec!["a.com".to_string()], - disliked: vec![], - blocked: vec![], - }), - ..Default::default() - }) - .unwrap() - .webpages; - - assert_eq!(res.len(), 1); - assert_eq!(res[0].url, "https://b.com/"); - } - - #[test] - fn discussion_optic() { - let mut index = Index::temporary().expect("Unable to open index"); - - index - .insert(Webpage { - html: Html::parse( - include_str!("../../testcases/schema_org/infinity_war.html"), - "https://a.com", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - index.commit().expect("failed to commit index"); - - let searcher = LocalSearcher::from(index); - let res = searcher - .search(&SearchQuery { - query: "avengers endgame".to_string(), - ..Default::default() - }) - .unwrap() - .webpages; - - assert!(!res.is_empty()); - assert_eq!(&res[0].url, "https://a.com/"); - - let res = searcher - .search(&SearchQuery { - query: "avengers endgame".to_string(), - optic: Some(Optic::parse(include_str!("../searcher/discussions.optic")).unwrap()), - ..Default::default() - }) - .unwrap() - .webpages; - assert!(res.is_empty()); - } - - #[test] - fn special_pattern_syntax() { - let mut index = Index::temporary().expect("Unable to open index"); - - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - This is an example website - - - {CONTENT} {} - This is an example - - - "#, - crate::rand_words(1000) - ), - "https://example.com", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - index.commit().expect("failed to commit index"); - - let searcher = LocalSearcher::from(index); - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 1); - assert_eq!(res[0].url, "https://example.com/"); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse("Rule { Matches { Title(\"is\") }, Action(Discard) }").unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 0); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse("Rule { Matches { Title(\"|is\") }, Action(Discard) }").unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 1); - - let res = searcher - .search(&SearchQuery { - query: 
"example".to_string(), - optic: Some( - Optic::parse("Rule { Matches { Title(\"|This\") }, Action(Discard) }").unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 0); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse("Rule { Matches { Title(\"|This an\") }, Action(Discard) }") - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 1); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse("Rule { Matches { Title(\"|This * an\") }, Action(Discard) }") - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 0); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse("Rule { Matches { Site(\"example.com\") }, Action(Discard) }") - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 0); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse("Rule { Matches { Site(\"|example.com\") }, Action(Discard) }") - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 0); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse("Rule { Matches { Site(\"|example.com|\") }, Action(Discard) }") - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 0); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse("Rule { Matches { Title(\"website.com|\") }, Action(Discard) }") - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 1); - } - - #[test] - fn active_optic_with_blocked_sites() { - let mut index = Index::temporary().expect("Unable to open index"); - - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - This is an example website - - - {CONTENT} {} - This is an example - - - "#, - crate::rand_words(1000) - ), - "https://example.com", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - index.commit().expect("failed to commit index"); - - let searcher = LocalSearcher::from(index); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse( - "DiscardNonMatching; Rule { Matches { Title(\"is\") }, Action(Boost(0)) }", - ) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 1); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse( - "DiscardNonMatching; Rule { Matches { Title(\"is\") }, Action(Boost(0)) }", - ) - .unwrap(), - ), - site_rankings: Some(SiteRankings { - liked: vec![], - disliked: vec![], - blocked: vec![String::from("example.com")], - }), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 0); - } - - #[test] - fn empty_optic_noop() { - let mut index = Index::temporary().expect("Unable to open index"); - - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - This is an example website - - - {CONTENT} {} - This is an example - - - "#, - crate::rand_words(1000) - ), - 
"https://example.com", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - index.commit().expect("failed to commit index"); - - let searcher = LocalSearcher::from(index); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some(Optic::parse("").unwrap()), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 1); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse("Rule { Matches { Title(\"\") }, Action(Discard) }").unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 1); - } - - #[test] - fn wildcard_edge_cases() { - let mut index = Index::temporary().expect("Unable to open index"); - - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - This is an example website - - - {CONTENT} {} - This is an example - - - "#, - crate::rand_words(1000) - ), - "https://example.com", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - index - .insert(Webpage { - html: Html::parse( - &format!( - r#" - - - Another thing with no words in common - - - {CONTENT} {} - This is an example - - - "#, - crate::rand_words(1000) - ), - "https://example.com", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }) - .expect("failed to insert webpage"); - index.commit().expect("failed to commit index"); - - let searcher = LocalSearcher::from(index); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse("Rule { Matches { Title(\"*\") }, Action(Discard) }").unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 0); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse("Rule { Matches { Title(\"* is\") }, Action(Discard) }").unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 1); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse("Rule { Matches { Title(\"* This is\") }, Action(Discard) }") - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 1); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse("Rule { Matches { Title(\"example *\") }, Action(Discard) }") - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 1); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse( - "Rule { Matches { Title(\"example website *\") }, Action(Discard) }", - ) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 1); - } - - #[test] - fn empty_double_anchor() { - let mut index = Index::temporary().expect("Unable to open index"); - - let mut page = Webpage { - html: Html::parse( - r#" - - - This is an example 
website - - - Test - - - "#, - "https://example.com/", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }; - - page.html.set_clean_text("".to_string()); - - index.insert(page).expect("failed to insert webpage"); - index.commit().expect("failed to commit index"); - - let searcher = LocalSearcher::from(index); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 1); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse("DiscardNonMatching; Rule { Matches { Content(\"||\") }, Action(Boost(0)) }") - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 1); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse( - "DiscardNonMatching; Rule { Matches { Content(\"|\") }, Action(Boost(0)) }", - ) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 1); - } - - #[test] - fn indieweb_search() { - let mut index = Index::temporary().expect("Unable to open index"); - - let mut page = Webpage { - html: Html::parse( - r#" - - - This is an example indie website - - -
-

Microformats are amazing

-

This is the content of the article

- Permalink - Author - -
- - - "#, - "https://example.com/", - ).unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }; - - page.html.set_clean_text("".to_string()); - - index.insert(page).expect("failed to insert webpage"); - - let mut page = Webpage { - html: Html::parse( - r#" - - - This is an example non-indie website - - - example example example - - - "#, - "https://non-indie-example.com/", - ) - .unwrap(), - backlink_labels: vec![], - host_centrality: 0.0, - page_centrality: 0.0, - fetch_time_ms: 500, - pre_computed_score: 0.0, - - node_id: None, - dmoz_description: None, - safety_classification: None, - }; - - page.html.set_clean_text("".to_string()); - - index.insert(page).expect("failed to insert webpage"); - index.commit().expect("failed to commit index"); - - let searcher = LocalSearcher::from(index); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 2); - - let res = searcher - .search(&SearchQuery { - query: "example".to_string(), - optic: Some( - Optic::parse( - "DiscardNonMatching; Rule { Matches { MicroformatTag(\"|h-*\") } }", - ) - .unwrap(), - ), - ..Default::default() - }) - .unwrap() - .webpages; - assert_eq!(res.len(), 1); - assert_eq!(res[0].domain, "example.com"); - } -} diff --git a/crates/alice/Cargo.toml b/crates/alice/Cargo.toml new file mode 100644 index 000000000..6b506ddc6 --- /dev/null +++ b/crates/alice/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "alice" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +aes-gcm = "0.10.3" +anyhow.workspace = true +base64.workspace = true +bincode.workspace = true +flate2.workspace = true +half.workspace = true +memmap2.workspace = true +safetensors.workspace = true +serde.workspace = true +serde_json.workspace = true +stdx.workspace = true +stract-config.workspace = true +stract-llm.workspace = true +tch.workspace = true +thiserror.workspace = true +tokenizers.workspace = true +tracing.workspace = true +utoipa.workspace = true diff --git a/core/src/alice/generate.rs b/crates/alice/src/generate.rs similarity index 98% rename from core/src/alice/generate.rs rename to crates/alice/src/generate.rs index 75f4db58a..86735a148 100644 --- a/core/src/alice/generate.rs +++ b/crates/alice/src/generate.rs @@ -17,19 +17,15 @@ use std::{rc::Rc, sync::Arc}; use aes_gcm::{Aes256Gcm, Key}; +use stdx::leaky_queue::LeakyQueue; +use stract_llm::llm_utils::{self, ClonableTensor}; use tch::{IndexOp, Kind, Tensor}; use crate::{ - alice::ExecutionState, - leaky_queue::LeakyQueue, - llm_utils::{self, ClonableTensor}, -}; - -use super::{ raw_model::RawModel, AnyTransitionValidator, EncodedEncryptedState, EncryptedState, Error, - ModelState, Result, Searcher, Tokenizer, TransitionValidator, + ExecutionState, ModelState, ModelWebsite, Result, Searcher, SimplifiedWebsite, Tokenizer, + TransitionValidator, }; -use crate::alice::{ModelWebsite, SimplifiedWebsite}; // const TAU: f64 = 0.3; const TEMP: f64 = 0.4; // 0.4 @@ -590,7 +586,7 @@ impl Iterator for ActionGenerator { pub struct ActionExecutor { generator: ActionGenerator, - searcher: Searcher, + searcher: Box, query_to_search: Option, queries_performed: Vec, has_finished: bool, @@ -600,9 +596,9 @@ pub struct ActionExecutor { unsafe impl Send for 
ActionExecutor {} impl ActionExecutor { - pub fn new( + pub(super) fn new( action_gen: ActionGenerator, - searcher: Searcher, + searcher: Box, encryption_key: Key, ) -> Self { ActionExecutor { @@ -614,7 +610,7 @@ impl ActionExecutor { encryption_key, } } - pub fn state(&self) -> Tensor { + pub(super) fn state(&self) -> Tensor { self.generator .raw .token_generator diff --git a/core/src/alice/mod.rs b/crates/alice/src/lib.rs similarity index 79% rename from core/src/alice/mod.rs rename to crates/alice/src/lib.rs index 2c4164656..dafaf2165 100644 --- a/core/src/alice/mod.rs +++ b/crates/alice/src/lib.rs @@ -24,6 +24,9 @@ //! To make sure the state has not been tampered with, it is encrypted //! using an AES-GCM key. +pub mod generate; +mod raw_model; + use std::{ io::{Read, Write}, path::Path, @@ -40,25 +43,13 @@ pub use base64::prelude::BASE64_STANDARD as BASE64_ENGINE; use base64::Engine; use flate2::{bufread::GzDecoder, write::GzEncoder, Compression}; use half::bf16; -use itertools::Itertools; +use stract_config::{AcceleratorDevice, AcceleratorDtype, AliceAcceleratorConfig}; +use stract_llm::llm_utils::ClonableTensor; use tch::Tensor; -use url::Url; use utoipa::ToSchema; -use crate::{ - api::search::ApiSearchQuery, - config::AcceleratorDevice, - config::AcceleratorDtype, - config::AliceAcceleratorConfig, - llm_utils::ClonableTensor, - search_prettifier::DisplayedWebpage, - searcher::{SearchResult, WebsitesResult}, -}; - -use self::{ - generate::{ActionExecutor, ActionGenerator, AliceTokenGenerator, RawActionGenerator}, - raw_model::RawModel, -}; +use generate::{ActionExecutor, ActionGenerator, AliceTokenGenerator, RawActionGenerator}; +use raw_model::RawModel; const PROMPT_PREFIX: &str = r#"System: Your name is Alice, and you are an AI assistant trained by Stract. Below is a conversation between you and a user. Help the user as best you can. You can lookup information on the web in the following format: @@ -70,9 +61,6 @@ This Thought/Search/Result can repeat N times. When you are ready to answer the user, use the following format: Alice: <|endoftext|>"#; -pub mod generate; -mod raw_model; - type Result = std::result::Result; #[derive(thiserror::Error, Debug)] @@ -80,9 +68,6 @@ pub enum Error { #[error("Empty input")] EmptyInput, - #[error("Unexpected search result")] - UnexpectedSearchResult, - #[error("Failed to decrypt")] DecryptionFailed, @@ -93,33 +78,6 @@ pub enum Error { LastMessageNotUser, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, ToSchema)] -pub struct SimplifiedWebsite { - pub title: String, - pub text: String, - pub url: String, - pub site: String, -} - -impl SimplifiedWebsite { - fn new(webpage: DisplayedWebpage) -> Self { - let text = webpage - .snippet - .text() - .map(|t| t.fragments.iter().map(|f| f.text()).join("")) - .unwrap_or_default(); - - let url = Url::parse(&webpage.url).unwrap(); - - Self { - title: webpage.title, - text, - site: url.host_str().unwrap_or_default().to_string(), - url: url.to_string(), - } - } -} - #[derive(Debug, serde::Serialize, serde::Deserialize, ToSchema)] #[serde(tag = "type", rename_all = "camelCase")] pub enum ExecutionState { @@ -138,6 +96,18 @@ pub enum ExecutionState { }, } +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, ToSchema)] +pub struct SimplifiedWebsite { + pub title: String, + pub text: String, + pub url: String, + pub site: String, +} + +pub trait Searcher { + fn search(&self, query: &str) -> Result>; +} + /// A simplified website that the model sees in the prompt. 
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] struct ModelWebsite { @@ -156,57 +126,6 @@ impl From for ModelWebsite { } } -pub struct Searcher { - url: String, - optic_url: Option, -} - -impl Searcher { - fn raw_search(&self, query: &str) -> Result { - let optic = self - .optic_url - .as_ref() - .and_then(|url| reqwest::blocking::get(url).ok().and_then(|r| r.text().ok())); - - let client = reqwest::blocking::Client::new(); - let query = ApiSearchQuery { - query: query.trim().to_string(), - num_results: Some(3), - optic, - page: None, - selected_region: None, - site_rankings: None, - return_ranking_signals: false, - flatten_response: false, - safe_search: Some(false), - fetch_discussions: false, - count_results: false, - }; - tracing::debug!("searching at {:?}: {:#?}", self.url, query); - - let res: SearchResult = client.post(&self.url).json(&query).send()?.json()?; - - match res { - SearchResult::Websites(res) => Ok(res), - SearchResult::Bang(_) => Err(Error::UnexpectedSearchResult.into()), - } - } - - fn search(&self, query: &str) -> Result> { - let res = self.raw_search(query)?; - - let mut websites = Vec::new(); - - for website in res.webpages { - websites.push(SimplifiedWebsite::new(website)); - } - - tracing::debug!("search result: {:#?}", websites); - - Ok(websites) - } -} - pub struct Alice { inner: Rc, tokenizer: Arc, @@ -228,44 +147,35 @@ pub struct AcceleratorConfig { pub kind: tch::Kind, } -impl From for tch::Device { - fn from(value: AcceleratorDevice) -> Self { - match value { +impl From for AcceleratorConfig { + fn from(value: AliceAcceleratorConfig) -> Self { + let device = match value.device { AcceleratorDevice::Cpu => tch::Device::Cpu, AcceleratorDevice::Cuda(d) => tch::Device::Cuda(d), AcceleratorDevice::Mps => tch::Device::Mps, - } - } -} - -impl From for tch::Kind { - fn from(value: AcceleratorDtype) -> Self { - match value { + }; + let kind = match value.dtype { AcceleratorDtype::Float => tch::Kind::Float, AcceleratorDtype::Bf16 => tch::Kind::BFloat16, - } - } -} + }; -impl From for AcceleratorConfig { - fn from(value: AliceAcceleratorConfig) -> Self { Self { layer_fraction: value.layer_fraction, quantize_fraction: value.quantize_fraction, - device: value.device.into(), - kind: value.dtype.into(), + device, + kind, } } } impl Alice { - pub fn open>( - folder: P, + pub fn open( + folder: &Path, accelerator: Option, encryption_key: &[u8], ) -> Result { let encryption_key = *Key::::from_slice(encryption_key); - let mut model = RawModel::open(folder.as_ref().join("model.safetensors"))?; + let mut model = RawModel::open(&folder.join("model.safetensors"))?; if let Some(accelerator) = accelerator { model.load_to_device( @@ -277,7 +187,7 @@ impl Alice { } let inner = Rc::new(model); - let tokenizer = Arc::new(Tokenizer::open(folder.as_ref().join("tokenizer.json"))?); + let tokenizer = Arc::new(Tokenizer::open(&folder.join("tokenizer.json"))?); let end_tokens = vec![tokenizer.tokenizer.token_to_id("<|endoftext|>").unwrap() as i64]; @@ -305,12 +215,19 @@ impl Alice { Ok(ClonableTensor(state)) } - pub fn new_executor( + pub fn new_executor( + &self, + user_question: &str, + last_state: Option, + searcher: S, + ) -> Result { + self.new_executor_inner(user_question, last_state, Box::new(searcher)) + } + fn new_executor_inner( &self, user_question: &str, last_state: Option, - search_url: String, - optic_url: Option, + searcher: Box, ) -> Result { let mut state = None; @@ -345,11 +262,6 @@ impl Alice { let raw_action_gen = RawActionGenerator::new(token_generator, 
self.tokenizer.clone()); let action_gen = ActionGenerator::new(raw_action_gen); - let searcher = Searcher { - url: search_url, - optic_url, - }; - Ok(ActionExecutor::new( action_gen, searcher, @@ -369,7 +281,7 @@ pub struct Tokenizer { } impl Tokenizer { - pub fn open>(path: P) -> Result { + pub fn open(path: &Path) -> Result { let tokenizer = tokenizers::Tokenizer::from_file(path).map_err(|e| anyhow!(e))?; Ok(Self { tokenizer }) diff --git a/core/src/alice/raw_model.rs b/crates/alice/src/raw_model.rs similarity index 99% rename from core/src/alice/raw_model.rs rename to crates/alice/src/raw_model.rs index 0c7a900cc..3b8e1a53d 100644 --- a/core/src/alice/raw_model.rs +++ b/crates/alice/src/raw_model.rs @@ -15,7 +15,7 @@ use std::{fs::OpenOptions, path::Path}; // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use crate::alice::Result; +use crate::Result; use safetensors::SafeTensors; use tch::{ nn::{embedding, layer_norm, Embedding, LayerNorm, LayerNormConfig, ModuleT, VarStore}, @@ -521,7 +521,7 @@ pub struct RawModel { } impl RawModel { - pub fn open>(path: P) -> Result { + pub fn open(path: &Path) -> Result { // SAFETY // broadcasting: please don't modify the file while we're reading it. let mmap = unsafe { diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml new file mode 100644 index 000000000..99c9331dd --- /dev/null +++ b/crates/cli/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "stract-cli" +version = "0.1.0" +edition = "2021" +default-run = "stract" + +[[bin]] +name = "stract" +path = "src/main.rs" + +[features] +cors = ["stract-core/cors"] +default = [] +dev = ["cors"] +libtorch = ["stract-core/libtorch"] +prod = ["cors", "libtorch"] +with_alice = ["libtorch"] + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow.workspace = true +clap.workspace = true +serde.workspace = true +stract-config.workspace = true +stract-core.workspace = true +tokio.workspace = true +toml.workspace = true +tracing.workspace = true +tracing-subscriber.workspace = true +webgraph.workspace = true + +[target.'cfg(not(target_env = "msvc"))'.dependencies] +tikv-jemallocator.workspace = true diff --git a/core/src/main.rs b/crates/cli/src/main.rs similarity index 78% rename from core/src/main.rs rename to crates/cli/src/main.rs index 9876fcdd5..4651a5d1d 100644 --- a/core/src/main.rs +++ b/crates/cli/src/main.rs @@ -17,16 +17,15 @@ use anyhow::{Context, Result}; use clap::{Parser, Subcommand}; use serde::de::DeserializeOwned; use std::fs; -use std::path::Path; -use stract::config; -use stract::entrypoint::autosuggest_scrape::{self, Gl}; +use std::path::{Path, PathBuf}; +use stract_core::entrypoint::autosuggest_scrape::{self, Gl}; +use webgraph::WebgraphBuilder; #[cfg(feature = "dev")] -use stract::entrypoint::configure; +use stract_core::entrypoint::configure; -use stract::entrypoint::indexer::IndexPointer; -use stract::entrypoint::{self, api, safety_classifier, search_server, webgraph_server}; -use stract::webgraph::WebgraphBuilder; +use stract_core::entrypoint::indexer::IndexPointer; +use stract_core::entrypoint::{self, api, safety_classifier, search_server, webgraph_server}; use tracing_subscriber::prelude::*; #[cfg(not(target_env = "msvc"))] @@ -61,8 +60,8 @@ enum Commands { /// Parse the DMOZ dataset. DMOZ contains a list of websites and their categories. 
/// It can be used to calculate the topic centrality for websites or augments website descriptions during indexing. DmozParser { - dmoz_file: String, - output_path: String, + dmoz_file: PathBuf, + output_path: PathBuf, }, /// Webgraph specific commands. @@ -72,18 +71,18 @@ enum Commands { }, /// Deploy the search server. - SearchServer { config_path: String }, + SearchServer { config_path: PathBuf }, /// Deploy the json http api. The api interacts with /// the search servers, webgraph servers etc. to provide the necesarry functionality. - Api { config_path: String }, + Api { config_path: PathBuf }, /// Scrape the Google autosuggest API for search queries. AutosuggestScrape { num_queries: usize, gl: Gl, ms_sleep_between_req: u64, - output_dir: String, + output_dir: PathBuf, }, /// Deploy the crawler. @@ -113,18 +112,18 @@ enum Commands { enum Crawler { /// Deploy the crawl worker. The worker is responsible for downloading webpages, saving them to S3, /// and sending newly discovered urls back to the crawl coordinator. - Worker { config_path: String }, + Worker { config_path: PathBuf }, /// Deploy the crawl coordinator. The crawl coordinator is responsible for /// distributing crawl jobs to the crawles and deciding which urls to crawl next. - Coordinator { config_path: String }, + Coordinator { config_path: PathBuf }, /// Deploy the crawl router. The crawl router is responsible for routing job responses and requests /// from the workers to the correct crawl coordinators. - Router { config_path: String }, + Router { config_path: PathBuf }, /// Create a crawl plan. - Plan { config_path: String }, + Plan { config_path: PathBuf }, } /// Commands to train or run inference on the classifier that predicts if a webpage is NSFW or SFW. @@ -132,62 +131,61 @@ enum Crawler { enum SafetyClassifierOptions { /// Train the classifier Train { - dataset_path: String, - output_path: String, + dataset_path: PathBuf, + output_path: PathBuf, }, /// Run a single prediction to test the model - Predict { model_path: String, text: String }, + Predict { model_path: PathBuf, text: String }, } #[derive(Subcommand)] enum CentralityMode { /// Calculate metrics for the host webgraph. Host { - webgraph_path: String, - output_path: String, + webgraph_path: PathBuf, + output_path: PathBuf, }, /// Calculate metrics for the page webgraph. Page { - webgraph_path: String, - host_centrality_path: String, - output_path: String, + webgraph_path: PathBuf, + host_centrality_path: PathBuf, + output_path: PathBuf, }, } #[derive(Subcommand)] enum WebgraphOptions { /// Create a new webgraph. - Create { config_path: String }, + Create { config_path: PathBuf }, /// Merge multiple webgraphs into a single graph. Merge { #[clap(required = true)] - paths: Vec, + paths: Vec, }, /// Deploy the webgraph server. The webgraph server is responsible for serving the webgraph to the search servers. /// This is e.g. used to find similar sites etc. - Server { config_path: String }, + Server { config_path: PathBuf }, } #[derive(Subcommand)] enum IndexingOptions { /// Create the search index. - Search { config_path: String }, + Search { config_path: PathBuf }, /// Create the entity index. Used in the sidebar of the search UI. Entity { - wikipedia_dump_path: String, - output_path: String, + wikipedia_dump_path: PathBuf, + output_path: PathBuf, }, /// Merge multiple search indexes into a single index. 
- Merge { indexes: Vec }, + Merge { indexes: Vec }, } -fn load_toml_config>(path: P) -> T { - let path = path.as_ref(); +fn load_toml_config(path: &Path) -> T { let raw_config = fs::read_to_string(path) .with_context(|| format!("Failed to read config: '{}'", path.display())) .unwrap(); @@ -210,13 +208,13 @@ fn main() -> Result<()> { match args.command { Commands::Indexer { options } => match options { IndexingOptions::Search { config_path } => { - let config = load_toml_config(config_path); + let config = load_toml_config(&config_path); entrypoint::Indexer::run(&config)?; } IndexingOptions::Entity { wikipedia_dump_path, output_path, - } => entrypoint::EntityIndexer::run(wikipedia_dump_path, output_path)?, + } => entrypoint::EntityIndexer::run(&wikipedia_dump_path, &output_path)?, IndexingOptions::Merge { indexes } => { entrypoint::Indexer::merge(indexes.into_iter().map(IndexPointer::from).collect())? } @@ -235,20 +233,20 @@ fn main() -> Result<()> { host_centrality_path, output_path, } => entrypoint::Centrality::build_derived_harmonic( - webgraph_path, - host_centrality_path, - output_path, + &webgraph_path, + &host_centrality_path, + &output_path, )?, } tracing::info!("Done"); } Commands::Webgraph { options } => match options { WebgraphOptions::Create { config_path } => { - let config = load_toml_config(config_path); + let config = load_toml_config(&config_path); entrypoint::Webgraph::run(&config)?; } WebgraphOptions::Merge { mut paths } => { - let mut webgraph = WebgraphBuilder::new(paths.remove(0)).open(); + let mut webgraph = WebgraphBuilder::new(&paths.remove(0)).open(); for other_path in paths { let other = WebgraphBuilder::new(&other_path).open(); @@ -257,7 +255,7 @@ fn main() -> Result<()> { } } WebgraphOptions::Server { config_path } => { - let config: config::WebgraphServerConfig = load_toml_config(config_path); + let config: stract_config::WebgraphServerConfig = load_toml_config(&config_path); tokio::runtime::Builder::new_multi_thread() .enable_all() @@ -266,7 +264,7 @@ fn main() -> Result<()> { } }, Commands::Api { config_path } => { - let config: config::ApiConfig = load_toml_config(config_path); + let config: stract_config::ApiConfig = load_toml_config(&config_path); tokio::runtime::Builder::new_multi_thread() .enable_all() @@ -274,7 +272,7 @@ fn main() -> Result<()> { .block_on(api::run(config))? } Commands::SearchServer { config_path } => { - let config: config::SearchServerConfig = load_toml_config(config_path); + let config: stract_config::SearchServerConfig = load_toml_config(&config_path); tokio::runtime::Builder::new_multi_thread() .enable_all() @@ -287,7 +285,7 @@ fn main() -> Result<()> { ms_sleep_between_req, output_dir, } => { - autosuggest_scrape::run(queries_to_scrape, gl, ms_sleep_between_req, output_dir)?; + autosuggest_scrape::run(queries_to_scrape, gl, ms_sleep_between_req, &output_dir)?; } #[cfg(feature = "dev")] Commands::Configure { @@ -303,10 +301,10 @@ fn main() -> Result<()> { Commands::DmozParser { dmoz_file, output_path, - } => entrypoint::dmoz_parser::run(dmoz_file, output_path).unwrap(), + } => entrypoint::dmoz_parser::run(&dmoz_file, &output_path).unwrap(), Commands::Crawler { options } => match options { Crawler::Worker { config_path } => { - let config: config::CrawlerConfig = load_toml_config(config_path); + let config: stract_config::CrawlerConfig = load_toml_config(&config_path); tokio::runtime::Builder::new_multi_thread() .enable_all() @@ -314,7 +312,7 @@ fn main() -> Result<()> { .block_on(entrypoint::crawler::worker(config))? 
} Crawler::Coordinator { config_path } => { - let config: config::CrawlCoordinatorConfig = load_toml_config(config_path); + let config: stract_config::CrawlCoordinatorConfig = load_toml_config(&config_path); tokio::runtime::Builder::new_multi_thread() .enable_all() @@ -322,7 +320,7 @@ fn main() -> Result<()> { .block_on(entrypoint::crawler::coordinator(config))? } Crawler::Router { config_path } => { - let config: config::CrawlRouterConfig = load_toml_config(config_path); + let config: stract_config::CrawlRouterConfig = load_toml_config(&config_path); tokio::runtime::Builder::new_multi_thread() .enable_all() @@ -330,7 +328,7 @@ fn main() -> Result<()> { .block_on(entrypoint::crawler::router(config))? } Crawler::Plan { config_path } => { - let config: config::CrawlPlannerConfig = load_toml_config(config_path); + let config: stract_config::CrawlPlannerConfig = load_toml_config(&config_path); entrypoint::crawler::planner(config)?; } @@ -339,9 +337,9 @@ fn main() -> Result<()> { SafetyClassifierOptions::Train { dataset_path, output_path, - } => safety_classifier::train(dataset_path, output_path)?, + } => safety_classifier::train(&dataset_path, &output_path)?, SafetyClassifierOptions::Predict { model_path, text } => { - safety_classifier::predict(model_path, &text)? + safety_classifier::predict(&model_path, &text)? } }, } diff --git a/crates/collector/Cargo.toml b/crates/collector/Cargo.toml new file mode 100644 index 000000000..09858c062 --- /dev/null +++ b/crates/collector/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "collector" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +min-max-heap.workspace = true +schema.workspace = true +serde.workspace = true +simhash.workspace = true +stdx.workspace = true +stract-config.workspace = true +tantivy.workspace = true diff --git a/core/src/collector.rs b/crates/collector/src/lib.rs similarity index 94% rename from core/src/collector.rs rename to crates/collector/src/lib.rs index f3737b7e6..569452160 100644 --- a/core/src/collector.rs +++ b/crates/collector/src/lib.rs @@ -17,24 +17,50 @@ use std::{collections::HashMap, sync::Arc}; use min_max_heap::MinMaxHeap; +use schema::{fastfield_reader, FastField}; use serde::{Deserialize, Serialize}; +use stdx::prehashed::Prehashed; +use stract_config::CollectorConfig; use tantivy::{ collector::{Collector, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector}, DocId, SegmentOrdinal, SegmentReader, }; -use crate::{ - combine_u64s, - config::CollectorConfig, - fastfield_reader, - inverted_index::{DocAddress, WebsitePointer}, - prehashed::Prehashed, - ranking::initial::{InitialScoreTweaker, Score}, - schema::FastField, - simhash, -}; +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] +pub struct Score { + pub total: f64, +} + +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] +pub struct WebsitePointer { + pub score: Score, + pub hashes: Hashes, + pub address: DocAddress, +} -pub type MainCollector = TweakedScoreTopCollector; +#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)] +pub struct DocAddress { + pub segment: u32, + pub doc_id: u32, +} + +impl From for DocAddress { + fn from(address: tantivy::DocAddress) -> Self { + Self { + segment: address.segment_ord, + doc_id: address.doc_id, + } + } +} + +impl From for tantivy::DocAddress { + fn from(address: DocAddress) -> Self { + Self { + segment_ord: address.segment, + doc_id: address.doc_id, + } + } +} #[derive(Clone)] pub 
struct MaxDocsConsidered { @@ -101,7 +127,7 @@ impl TopDocs { self } - pub fn main_collector(self, score_tweaker: InitialScoreTweaker) -> MainCollector { + pub fn main_collector(self, score_tweaker: T) -> TweakedScoreTopCollector { TweakedScoreTopCollector::new(score_tweaker, self) } } @@ -155,7 +181,7 @@ impl TopSegmentCollector { let hash2 = hash.unwrap(); let hash = [hash1, hash2]; - combine_u64s(hash).into() + stdx::combine_u64s(hash).into() } } diff --git a/crates/config/Cargo.toml b/crates/config/Cargo.toml new file mode 100644 index 000000000..f9e36602d --- /dev/null +++ b/crates/config/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "stract-config" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +distributed.workspace = true +rust-s3.workspace = true +serde.workspace = true diff --git a/core/src/config/defaults.rs b/crates/config/src/defaults.rs similarity index 100% rename from core/src/config/defaults.rs rename to crates/config/src/defaults.rs diff --git a/core/src/config/mod.rs b/crates/config/src/lib.rs similarity index 88% rename from core/src/config/mod.rs rename to crates/config/src/lib.rs index 4beb5b63c..eb1625298 100644 --- a/core/src/config/mod.rs +++ b/crates/config/src/lib.rs @@ -16,12 +16,12 @@ pub mod defaults; -use super::Result; -use crate::searcher::ShardId; +use distributed::member::ShardId; use serde::{Deserialize, Serialize}; use std::fs::File; use std::io::{self, BufRead}; use std::net::SocketAddr; +use std::path::PathBuf; #[derive(Debug, Deserialize, Clone)] pub struct IndexingLocalConfig { @@ -29,20 +29,20 @@ pub struct IndexingLocalConfig { pub skip_warc_files: Option, pub warc_source: WarcSource, pub batch_size: Option, - pub page_webgraph_path: Option, - pub output_path: Option, + pub page_webgraph_path: Option, + pub output_path: Option, pub host_centrality_threshold: Option, - pub topics_path: Option, - pub host_centrality_store_path: String, - pub page_centrality_store_path: Option, - pub safety_classifier_path: Option, + pub topics_path: Option, + pub host_centrality_store_path: PathBuf, + pub page_centrality_store_path: Option, + pub safety_classifier_path: Option, pub minimum_clean_words: Option, } #[derive(Debug, Deserialize, Clone)] pub struct WebgraphConstructConfig { - pub host_graph_base_path: String, - pub page_graph_base_path: String, + pub host_graph_base_path: PathBuf, + pub page_graph_base_path: PathBuf, pub warc_source: WarcSource, pub limit_warc_files: Option, pub batch_size: Option, @@ -57,7 +57,7 @@ pub enum WarcSource { } impl WarcSource { - pub fn paths(&self) -> Result> { + pub fn paths(&self) -> Result, s3::error::S3Error> { let mut warc_paths = Vec::new(); match &self { WarcSource::HTTP(config) => { @@ -186,14 +186,14 @@ impl Default for ApiThresholds { #[derive(Debug, Serialize, Deserialize, Clone)] pub struct ApiConfig { - pub queries_csv_path: String, + pub queries_csv_path: PathBuf, pub host: SocketAddr, pub prometheus_host: SocketAddr, - pub crossencoder_model_path: Option, - pub lambda_model_path: Option, - pub qa_model_path: Option, - pub bangs_path: String, - pub summarizer_path: String, + pub crossencoder_model_path: Option, + pub lambda_model_path: Option, + pub qa_model_path: Option, + pub bangs_path: PathBuf, + pub summarizer_path: PathBuf, pub query_store_db_host: Option, pub cluster_id: String, pub gossip_seed_nodes: Option>, @@ -241,11 +241,11 @@ pub struct SearchServerConfig { pub gossip_seed_nodes: Option>, 
pub gossip_addr: SocketAddr, pub shard_id: ShardId, - pub index_path: String, - pub entity_index_path: Option, - pub host_centrality_store_path: Option, - pub linear_model_path: Option, - pub lambda_model_path: Option, + pub index_path: PathBuf, + pub entity_index_path: Option, + pub host_centrality_store_path: Option, + pub linear_model_path: Option, + pub lambda_model_path: Option, pub host: SocketAddr, #[serde(default)] @@ -341,7 +341,7 @@ pub struct AliceLocalConfig { pub gossip_addr: SocketAddr, pub host: SocketAddr, - pub alice_path: String, + pub alice_path: PathBuf, pub accelerator: Option, /// base64 encoded pub encryption_key: String, @@ -350,9 +350,9 @@ pub struct AliceLocalConfig { #[derive(Debug, Serialize, Deserialize, Clone)] pub struct WebgraphServerConfig { pub host: SocketAddr, - pub host_graph_path: String, - pub page_graph_path: String, - pub inbound_similarity_path: String, + pub host_graph_path: PathBuf, + pub page_graph_path: PathBuf, + pub inbound_similarity_path: PathBuf, pub cluster_id: String, pub gossip_seed_nodes: Option>, pub gossip_addr: SocketAddr, @@ -363,16 +363,16 @@ pub struct WebgraphServerConfig { #[derive(Debug, Serialize, Deserialize, Clone)] pub struct WidgetsConfig { - pub thesaurus_paths: Vec, + pub thesaurus_paths: Vec, } #[derive(Debug, Serialize, Deserialize, Clone)] pub struct CrawlPlannerConfig { - pub page_harmonic_path: String, - pub host_harmonic_path: String, - pub page_graph_path: String, - pub host_graph_path: String, - pub output_path: String, + pub page_harmonic_path: PathBuf, + pub host_harmonic_path: PathBuf, + pub page_graph_path: PathBuf, + pub host_graph_path: PathBuf, + pub output_path: PathBuf, pub num_job_queues: usize, diff --git a/core/Cargo.toml b/crates/core/Cargo.toml similarity index 80% rename from core/Cargo.toml rename to crates/core/Cargo.toml index f1ec40abb..5807bf055 100644 --- a/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -1,9 +1,8 @@ [package] authors = ['Stract ApS. 
'] -default-run = "stract" edition = "2021" license = "AGPL-3.0-or-later" -name = "stract" +name = "stract-core" version = "0.1.0" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -12,22 +11,17 @@ version = "0.1.0" cors = [] default = ["cors", "with_alice", "libtorch"] dev = ["cors"] -libtorch = ["dep:tch"] +libtorch = ["dep:tch", "dep:stract-llm"] prod = ["cors", "libtorch"] -with_alice = ["libtorch"] - -[[bin]] -name = "stract" -path = "src/main.rs" +with_alice = ["libtorch", "dep:alice"] [dependencies] aes-gcm = {workspace = true} +alice = { workspace = true, optional = true } anyhow = {workspace = true} -async-channel = {workspace = true} async-stream = {workspace = true} async-trait = {workspace = true} axum = {workspace = true} -axum-extra = {workspace = true} axum-macros = {workspace = true} base64 = {workspace = true} bincode = {workspace = true} @@ -37,12 +31,15 @@ byteorder = {workspace = true} bzip2 = {workspace = true} chitchat = {workspace = true} chrono = {workspace = true} -clap = {workspace = true} +collector.workspace = true +crawler.workspace = true crossbeam-channel = {workspace = true} csv = {workspace = true} dashmap = {workspace = true} +distributed.workspace = true encoding_rs = {workspace = true} -eventsource-stream = {workspace = true} +entity_index.workspace = true +executor.workspace = true fend-core = {workspace = true} flate2 = {workspace = true} fst = {workspace = true} @@ -50,23 +47,26 @@ futures = {workspace = true} half = {workspace = true} hashbrown = {workspace = true} http = {workspace = true} +hyperloglog.workspace = true image = {workspace = true} +imager.workspace = true indicatif = {workspace = true} itertools = {workspace = true} -kuchiki = {path = "../kuchiki"} +kuchiki = {workspace = true} +kv.workspace = true lalrpop-util = {workspace = true} libc = {workspace = true} logos = {workspace = true} -lz-str = {workspace = true} lz4_flex = {workspace = true} +mapreduce.workspace = true md5 = {workspace = true} -memmap = {workspace = true} memmap2 = {workspace = true} mime = {workspace = true} min-max-heap = {workspace = true} +naive_bayes.workspace = true num_cpus = {workspace = true} once_cell = {workspace = true} -optics = {path = "../optics"} +optics = {workspace = true} parse_wiki_text = {workspace = true} publicsuffix = {workspace = true} quick-xml = {workspace = true} @@ -83,17 +83,25 @@ rocksdb = {workspace = true} rust-s3 = {workspace = true} rust-stemmers = {workspace = true} safetensors = {workspace = true} +schema.workspace = true scylla = {workspace = true} serde = {workspace = true} serde_json = {workspace = true} serde_urlencoded = {workspace = true} +simhash.workspace = true +sonic.workspace = true +spell.workspace = true +stdx.workspace = true +stract-config.workspace = true +stract-llm = { workspace = true, optional = true } +stract-query.workspace = true tantivy = {workspace = true} tch = {workspace = true, optional = true} thiserror = {workspace = true} +tokenizer.workspace = true tokenizers = {workspace = true} tokio = {workspace = true} tokio-stream = {workspace = true} -toml = {workspace = true} tower-http = {workspace = true} tracing = {workspace = true} tracing-subscriber = {workspace = true} @@ -102,17 +110,16 @@ urlencoding = {workspace = true} utoipa = {workspace = true} utoipa-swagger-ui = {workspace = true} uuid = {workspace = true} +warc.workspace = true +webgraph.workspace = true +webpage.workspace = true whatlang = {workspace = true} -[target.'cfg(not(target_env = 
"msvc"))'.dependencies] -tikv-jemallocator = {workspace = true} - [build-dependencies] lalrpop = {workspace = true} [dev-dependencies] criterion = {workspace = true} -insta = {workspace = true} maplit = {workspace = true} proptest = {workspace = true} proptest-derive = {workspace = true} diff --git a/core/benches/abstractive-summarizer.rs b/crates/core/benches/abstractive-summarizer.rs similarity index 97% rename from core/benches/abstractive-summarizer.rs rename to crates/core/benches/abstractive-summarizer.rs index 3ce01c745..f7f2fd840 100644 --- a/core/benches/abstractive-summarizer.rs +++ b/crates/core/benches/abstractive-summarizer.rs @@ -1,5 +1,5 @@ use criterion::{criterion_group, criterion_main, Criterion}; -use stract::summarizer::{AbstractiveModel, AbstractiveSummarizer}; +use stract_llm::summarizer::{AbstractiveModel, AbstractiveSummarizer}; pub fn criterion_benchmark(c: &mut Criterion) { let model = diff --git a/core/benches/build-similarity.rs b/crates/core/benches/build-similarity.rs similarity index 81% rename from core/benches/build-similarity.rs rename to crates/core/benches/build-similarity.rs index fbac03c15..5054e137d 100644 --- a/core/benches/build-similarity.rs +++ b/crates/core/benches/build-similarity.rs @@ -1,5 +1,6 @@ use criterion::{criterion_group, criterion_main, Criterion}; -use stract::{ranking::inbound_similarity::InboundSimilarity, webgraph::WebgraphBuilder}; +use stract_core::ranking::inbound_similarity::InboundSimilarity; +use webgraph::WebgraphBuilder; const WEBGRAPH_PATH: &str = "data/webgraph"; diff --git a/core/benches/harmonic-centrality.rs b/crates/core/benches/harmonic-centrality.rs similarity index 74% rename from core/benches/harmonic-centrality.rs rename to crates/core/benches/harmonic-centrality.rs index 33a6393d2..c3413e5a9 100644 --- a/core/benches/harmonic-centrality.rs +++ b/crates/core/benches/harmonic-centrality.rs @@ -1,10 +1,10 @@ use criterion::{criterion_group, criterion_main, Criterion}; -use stract::webgraph::{centrality::harmonic::HarmonicCentrality, WebgraphBuilder}; +use webgraph::{centrality::harmonic::HarmonicCentrality, WebgraphBuilder}; const WEBGRAPH_PATH: &str = "data/webgraph"; pub fn criterion_benchmark(c: &mut Criterion) { - let webgraph = WebgraphBuilder::new(WEBGRAPH_PATH).open(); + let webgraph = WebgraphBuilder::new(WEBGRAPH_PATH.as_ref()).open(); c.bench_function("Harmonic centrality calculation", |b| { b.iter(|| { for _ in 0..10 { diff --git a/core/benches/hyperloglog.rs b/crates/core/benches/hyperloglog.rs similarity index 93% rename from core/benches/hyperloglog.rs rename to crates/core/benches/hyperloglog.rs index 2a1ca1d5b..cc341db43 100644 --- a/core/benches/hyperloglog.rs +++ b/crates/core/benches/hyperloglog.rs @@ -1,5 +1,5 @@ use criterion::{criterion_group, criterion_main, Criterion}; -use stract::hyperloglog::HyperLogLog; +use hyperloglog::HyperLogLog; pub fn criterion_benchmark(c: &mut Criterion) { c.bench_function("Hyperloglog", |b| { diff --git a/core/benches/naive-bayes.rs b/crates/core/benches/naive-bayes.rs similarity index 98% rename from core/benches/naive-bayes.rs rename to crates/core/benches/naive-bayes.rs index 7d55c7b27..6b271a20e 100644 --- a/core/benches/naive-bayes.rs +++ b/crates/core/benches/naive-bayes.rs @@ -1,6 +1,5 @@ use criterion::{criterion_group, criterion_main, Criterion}; use rand::seq::SliceRandom; -use stract::naive_bayes; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] enum Label { diff --git a/core/benches/search-preindexed-optic.rs 
b/crates/core/benches/search-preindexed-optic.rs similarity index 94% rename from core/benches/search-preindexed-optic.rs rename to crates/core/benches/search-preindexed-optic.rs index 05c576249..aaa1f10ed 100644 --- a/core/benches/search-preindexed-optic.rs +++ b/crates/core/benches/search-preindexed-optic.rs @@ -1,6 +1,6 @@ use criterion::{criterion_group, criterion_main, Criterion}; use optics::Optic; -use stract::{ +use stract_core::{ index::Index, searcher::{LocalSearcher, SearchQuery}, }; @@ -28,7 +28,7 @@ macro_rules! bench { } pub fn criterion_benchmark(c: &mut Criterion) { - let index = Index::open(INDEX_PATH).unwrap(); + let index = Index::open(INDEX_PATH.as_ref()).unwrap(); let searcher = LocalSearcher::new(index); let optic = include_str!("../../optics/testcases/samples/discussions.optic"); diff --git a/core/benches/search-preindexed.rs b/crates/core/benches/search-preindexed.rs similarity index 91% rename from core/benches/search-preindexed.rs rename to crates/core/benches/search-preindexed.rs index 80f0d4d73..6bbd5be6d 100644 --- a/core/benches/search-preindexed.rs +++ b/crates/core/benches/search-preindexed.rs @@ -2,7 +2,7 @@ use std::path::Path; use criterion::{criterion_group, criterion_main, Criterion}; use optics::SiteRankings; -use stract::{ +use stract_core::{ index::Index, ranking::inbound_similarity::InboundSimilarity, searcher::{LocalSearcher, SearchQuery}, @@ -39,10 +39,10 @@ macro_rules! bench { } pub fn criterion_benchmark(c: &mut Criterion) { - let index = Index::open(INDEX_PATH).unwrap(); + let index = Index::open(INDEX_PATH.as_ref()).unwrap(); let mut searcher = LocalSearcher::new(index); searcher.set_inbound_similarity( - InboundSimilarity::open(Path::new(CENTRALITY_PATH).join("inbound_similarity")).unwrap(), + InboundSimilarity::open(&Path::new(CENTRALITY_PATH).join("inbound_similarity")).unwrap(), ); for _ in 0..1000 { diff --git a/core/benches/similar-sites.rs b/crates/core/benches/similar-sites.rs similarity index 76% rename from core/benches/similar-sites.rs rename to crates/core/benches/similar-sites.rs index 9f1bf9fde..5be9301ce 100644 --- a/core/benches/similar-sites.rs +++ b/crates/core/benches/similar-sites.rs @@ -1,22 +1,22 @@ use std::sync::Arc; use criterion::{criterion_group, criterion_main, Criterion}; -use stract::{ +use stract_core::{ ranking::inbound_similarity::InboundSimilarity, similar_sites::SimilarSitesFinder, - webgraph::WebgraphBuilder, }; +use webgraph::WebgraphBuilder; const WEBGRAPH_PATH: &str = "data/webgraph"; const INBOUND_SIMILARITY_PATH: &str = "data/centrality/inbound_similarity"; pub fn criterion_benchmark(c: &mut Criterion) { - let webgraph = Arc::new(WebgraphBuilder::new(WEBGRAPH_PATH).open()); - let inbound = InboundSimilarity::open(INBOUND_SIMILARITY_PATH).unwrap(); + let webgraph = Arc::new(WebgraphBuilder::new(WEBGRAPH_PATH.as_ref()).open()); + let inbound = InboundSimilarity::open(INBOUND_SIMILARITY_PATH.as_ref()).unwrap(); let finder = SimilarSitesFinder::new( webgraph, inbound, - stract::config::defaults::WebgraphServer::max_similar_sites(), + stract_config::defaults::WebgraphServer::max_similar_sites(), ); for _ in 0..10 { diff --git a/core/benches/spell-correction.rs b/crates/core/benches/spell-correction.rs similarity index 56% rename from core/benches/spell-correction.rs rename to crates/core/benches/spell-correction.rs index e2b6a65b7..4cddec2e9 100644 --- a/core/benches/spell-correction.rs +++ b/crates/core/benches/spell-correction.rs @@ -1,5 +1,6 @@ use criterion::{criterion_group, criterion_main, Criterion}; 
-use stract::{index::Index, searcher::SearchQuery, spell::Spell}; +use spell::Spell; +use stract_core::index::Index; const INDEX_PATH: &str = "data/index"; @@ -9,21 +10,14 @@ macro_rules! bench { desc.push_str($query); desc.push('\''); $c.bench_function(desc.as_str(), |b| { - b.iter(|| { - $spell - .correction(&SearchQuery { - query: $query.to_string(), - ..Default::default() - }) - .unwrap() - }) + b.iter(|| $spell.correction($query).unwrap()) }); }; } pub fn criterion_benchmark(c: &mut Criterion) { - let index = Index::open(INDEX_PATH).unwrap(); - let spell = Spell::for_index(&index); + let index = Index::open(INDEX_PATH.as_ref()).unwrap(); + let spell = Spell::for_searcher(index.inverted_index.tv_searcher()); for _ in 0..100 { bench!("asdf", spell, c); diff --git a/core/examples/alice.rs b/crates/core/examples/alice.rs similarity index 84% rename from core/examples/alice.rs rename to crates/core/examples/alice.rs index cb1babbe4..dba387433 100644 --- a/core/examples/alice.rs +++ b/crates/core/examples/alice.rs @@ -1,9 +1,10 @@ use std::io::Write; -use base64::Engine; -use stract::alice::{ +use alice::{ ExecutionState, BASE64_ENGINE, {AcceleratorConfig, Alice}, }; +use base64::Engine; +use stract_core::entrypoint::alice::StractSearcher; #[tokio::main] async fn main() { @@ -18,7 +19,7 @@ async fn main() { .unwrap(); let model = Alice::open( - "data/alice", + "data/alice".as_ref(), Some(AcceleratorConfig { layer_fraction: 1.0, quantize_fraction: 0.0, @@ -41,13 +42,12 @@ async fn main() { std::io::stdin().read_line(&mut input).unwrap(); let input = input.trim(); + let searcher = StractSearcher { + url: "http://localhost:3000/beta/api/search".to_string(), + optic_url: None, + }; let gen = model - .new_executor( - input, - last_state.clone(), - "http://localhost:3000/beta/api/search".to_string(), - None, - ) + .new_executor(input, last_state.clone(), searcher) .unwrap(); for n in gen { diff --git a/core/examples/mapreduce_manager.rs b/crates/core/examples/mapreduce_manager.rs similarity index 94% rename from core/examples/mapreduce_manager.rs rename to crates/core/examples/mapreduce_manager.rs index f2fc1d8f4..3839d4036 100644 --- a/core/examples/mapreduce_manager.rs +++ b/crates/core/examples/mapreduce_manager.rs @@ -1,7 +1,7 @@ use std::net::SocketAddr; +use mapreduce::{Manager, Map, Reduce, StatelessWorker}; use serde::{Deserialize, Serialize}; -use stract::mapreduce::{Manager, Map, Reduce, StatelessWorker}; use tracing::Level; use tracing_subscriber::FmtSubscriber; diff --git a/core/examples/mapreduce_worker.rs b/crates/core/examples/mapreduce_worker.rs similarity index 94% rename from core/examples/mapreduce_worker.rs rename to crates/core/examples/mapreduce_worker.rs index ccf2dc00d..a94c4b406 100644 --- a/core/examples/mapreduce_worker.rs +++ b/crates/core/examples/mapreduce_worker.rs @@ -1,7 +1,7 @@ use std::net::SocketAddr; +use mapreduce::{Map, Reduce, StatelessWorker, Worker}; use serde::{Deserialize, Serialize}; -use stract::mapreduce::{Map, Reduce, StatelessWorker, Worker}; use tracing::Level; use tracing_subscriber::FmtSubscriber; diff --git a/core/examples/print_inbound_links.rs b/crates/core/examples/print_inbound_links.rs similarity index 85% rename from core/examples/print_inbound_links.rs rename to crates/core/examples/print_inbound_links.rs index dccdddd2d..4db1156b8 100644 --- a/core/examples/print_inbound_links.rs +++ b/crates/core/examples/print_inbound_links.rs @@ -14,14 +14,12 @@ // You should have received a copy of the GNU Affero General Public License // along with 
this program. If not, see . -use stract::{ - ranking::inbound_similarity::InboundSimilarity, - webgraph::{Node, WebgraphBuilder}, -}; +use stract_core::ranking::inbound_similarity::InboundSimilarity; +use webgraph::{Node, WebgraphBuilder}; pub fn main() { - let graph = WebgraphBuilder::new("data/webgraph").open(); - let inbound = InboundSimilarity::open("data/centrality/inbound_similarity").unwrap(); + let graph = WebgraphBuilder::new("data/webgraph".as_ref()).open(); + let inbound = InboundSimilarity::open("data/centrality/inbound_similarity".as_ref()).unwrap(); for host in ["www.homedepot.com"] { println!("{host}:"); diff --git a/core/examples/print_similar_sites.rs b/crates/core/examples/print_similar_sites.rs similarity index 87% rename from core/examples/print_similar_sites.rs rename to crates/core/examples/print_similar_sites.rs index fac809c69..74a1de9f9 100644 --- a/core/examples/print_similar_sites.rs +++ b/crates/core/examples/print_similar_sites.rs @@ -14,11 +14,11 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use stract::{ +use stract_core::{ ranking::inbound_similarity::InboundSimilarity, similar_sites::{ScoredNode, SimilarSitesFinder}, - webgraph::WebgraphBuilder, }; +use webgraph::WebgraphBuilder; fn print_nodes(nodes: &[ScoredNode]) { for (i, node) in nodes.iter().enumerate() { @@ -39,13 +39,14 @@ fn print_top_nodes(liked_sites: &[&str], top_n: usize, similarity_finder: &Simil pub fn main() { const TOP_N: usize = 50; - let graph = WebgraphBuilder::new("data/webgraph").open(); - let inbound_similarity = InboundSimilarity::open("data/centrality/inbound_similarity").unwrap(); + let graph = WebgraphBuilder::new("data/webgraph".as_ref()).open(); + let inbound_similarity = + InboundSimilarity::open("data/centrality/inbound_similarity".as_ref()).unwrap(); let similarity_finder = SimilarSitesFinder::new( graph.into(), inbound_similarity, - stract::config::defaults::WebgraphServer::max_similar_sites(), + stract_config::defaults::WebgraphServer::max_similar_sites(), ); print_top_nodes( diff --git a/core/public_icann_suffix.dat b/crates/core/public_icann_suffix.dat similarity index 100% rename from core/public_icann_suffix.dat rename to crates/core/public_icann_suffix.dat diff --git a/core/public_suffix_list.dat b/crates/core/public_suffix_list.dat similarity index 100% rename from core/public_suffix_list.dat rename to crates/core/public_suffix_list.dat diff --git a/core/src/api/autosuggest.rs b/crates/core/src/api/autosuggest.rs similarity index 100% rename from core/src/api/autosuggest.rs rename to crates/core/src/api/autosuggest.rs diff --git a/core/src/api/docs.rs b/crates/core/src/api/docs.rs similarity index 82% rename from core/src/api/docs.rs rename to crates/core/src/api/docs.rs index 2984c83b2..d61506399 100644 --- a/core/src/api/docs.rs +++ b/crates/core/src/api/docs.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . 
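Throughout this patch, constructors and entrypoints that previously took `String` or `impl AsRef<Path>` now take `&Path`/`PathBuf`, which is why the benches and examples above pass string literals through `.as_ref()`. A minimal standalone sketch of that call-site idiom; the `open` function below is a hypothetical stand-in, not an API from this repository:

```rust
use std::path::{Path, PathBuf};

// Hypothetical stand-in for constructors like `WebgraphBuilder::new`,
// which now take `&Path` instead of `impl AsRef<Path>`.
fn open(path: &Path) -> PathBuf {
    path.join("segments")
}

fn main() {
    // `str` implements `AsRef<Path>`, and the parameter type drives the
    // inference, so a literal can be converted right at the call site.
    let from_literal = open("data/webgraph".as_ref());

    // A `PathBuf` (e.g. a clap-parsed argument) borrows as `&Path` directly.
    let dir = PathBuf::from("data/webgraph");
    let from_arg = open(&dir);

    println!("{} {}", from_literal.display(), from_arg.display());
}
```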
-use super::{autosuggest, explore, search, sites, summarize, webgraph}; +use super::{autosuggest, explore, search, sites, summarize}; use axum::Router; use utoipa::{Modify, OpenApi}; use utoipa_swagger_ui::SwaggerUi; @@ -23,12 +23,12 @@ use utoipa_swagger_ui::SwaggerUi; #[openapi( paths( search::api, - webgraph::host::similar, - webgraph::host::knows, - webgraph::host::ingoing_hosts, - webgraph::host::outgoing_hosts, - webgraph::page::ingoing_pages, - webgraph::page::outgoing_pages, + super::webgraph::host::similar, + super::webgraph::host::knows, + super::webgraph::host::ingoing_hosts, + super::webgraph::host::outgoing_hosts, + super::webgraph::page::ingoing_pages, + super::webgraph::page::outgoing_pages, autosuggest::route, summarize::summarize_route, sites::sites_export_optic, @@ -36,7 +36,7 @@ use utoipa_swagger_ui::SwaggerUi; ), components( schemas( - crate::webpage::region::Region, + webpage::region::Region, optics::SiteRankings, search::ApiSearchQuery, search::ApiSearchResult, @@ -55,10 +55,10 @@ use utoipa_swagger_ui::SwaggerUi; crate::snippet::TextSnippetFragment, crate::snippet::TextSnippetFragmentKind, - crate::entity_index::entity::EntitySnippet, - crate::entity_index::entity::EntitySnippetFragment, + entity_index::entity::EntitySnippet, + entity_index::entity::EntitySnippetFragment, - crate::bangs::UrlWrapper, + stract_query::bangs::UrlWrapper, crate::widgets::Widget, crate::widgets::calculator::Calculation, @@ -71,11 +71,11 @@ use utoipa_swagger_ui::SwaggerUi; crate::widgets::thesaurus::PartOfSpeechMeaning, crate::ranking::signal::SignalScore, - crate::bangs::BangHit, - crate::bangs::Bang, + stract_query::bangs::BangHit, + stract_query::bangs::Bang, - webgraph::host::SimilarSitesParams, - webgraph::KnowsSite, + super::webgraph::host::SimilarSitesParams, + super::webgraph::KnowsSite, crate::entrypoint::webgraph_server::ScoredSite, autosuggest::Suggestion, @@ -83,8 +83,8 @@ use utoipa_swagger_ui::SwaggerUi; sites::SitesExportOpticParams, explore::ExploreExportOpticParams, - crate::webgraph::Node, - crate::webgraph::FullEdge, + webgraph::Node, + webgraph::FullEdge, ), ), modifiers(&ApiModifier), diff --git a/core/src/api/explore.rs b/crates/core/src/api/explore.rs similarity index 100% rename from core/src/api/explore.rs rename to crates/core/src/api/explore.rs diff --git a/core/src/api/improvement.rs b/crates/core/src/api/improvement.rs similarity index 100% rename from core/src/api/improvement.rs rename to crates/core/src/api/improvement.rs diff --git a/core/src/api/metrics.rs b/crates/core/src/api/metrics.rs similarity index 100% rename from core/src/api/metrics.rs rename to crates/core/src/api/metrics.rs diff --git a/core/src/api/mod.rs b/crates/core/src/api/mod.rs similarity index 94% rename from core/src/api/mod.rs rename to crates/core/src/api/mod.rs index 99e1561d4..eb9adbe32 100644 --- a/core/src/api/mod.rs +++ b/crates/core/src/api/mod.rs @@ -18,27 +18,27 @@ //! All http requests are handled using axum. 
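For readers unfamiliar with axum, a minimal sketch of the routing pattern the module doc refers to: handlers are async functions, shared state is injected through the `State` extractor, and the assembled `Router` is handed to the server in `main`. The state shape and routes here are illustrative only, not the repository's actual API surface:

```rust
use std::sync::Arc;

use axum::{extract::State, routing::get, Json, Router};

// Illustrative shared state; the real `State` in this module carries the
// searcher, autosuggest index, cluster handle, etc.
struct AppState {
    motd: String,
}

async fn health() -> &'static str {
    "ok"
}

async fn motd(State(state): State<Arc<AppState>>) -> Json<String> {
    Json(state.motd.clone())
}

// Build the router; `main` would pass this to axum's server.
fn app() -> Router {
    let state = Arc::new(AppState {
        motd: "hello".to_string(),
    });

    Router::new()
        .route("/health", get(health))
        .route("/motd", get(motd))
        .with_state(state)
}

fn main() {
    // Hand `app()` to axum's server inside a tokio runtime; the exact serve
    // call depends on the axum version pinned in the workspace.
    let _router = app();
}
```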
use axum::{body::Body, extract, middleware, Router}; +use distributed::{ + cluster::Cluster, + member::{Member, Service}, +}; +use stdx::leaky_queue::LeakyQueue; +use stract_config::ApiConfig; +use stract_query::bangs::Bangs; use tokio::sync::Mutex; use tower_http::compression::CompressionLayer; use crate::{ autosuggest::Autosuggest, - bangs::Bangs, - config::ApiConfig, - distributed::{ - cluster::Cluster, - member::{Member, Service}, - }, improvement::{store_improvements_loop, ImprovementEvent}, - leaky_queue::LeakyQueue, ranking::models::lambdamart::LambdaMART, searcher::api::ApiSearcher, }; #[cfg(feature = "libtorch")] -use crate::{ - qa_model::QaModel, ranking::models::cross_encoder::CrossEncoderModel, summarizer::Summarizer, -}; +use crate::ranking::models::cross_encoder::CrossEncoderModel; +#[cfg(feature = "libtorch")] +use stract_llm::{qa_model::QaModel, summarizer::Summarizer}; use anyhow::Result; use std::{net::SocketAddr, sync::Arc}; @@ -98,7 +98,7 @@ pub async fn favicon() -> impl IntoResponse { Response::builder() .status(StatusCode::OK) .body(Body::from( - include_bytes!("../../../frontend/static/favicon.ico").to_vec(), + include_bytes!("../../../../frontend/static/favicon.ico").to_vec(), )) .unwrap() } @@ -136,7 +136,7 @@ pub async fn router(config: &ApiConfig, counters: Counters) -> Result { let state = { let mut cross_encoder = None; - if let Some(path) = config.crossencoder_model_path.as_ref() { + if let Some(path) = &config.crossencoder_model_path { cross_encoder = Some(CrossEncoderModel::open(path)?); } diff --git a/core/src/api/search.rs b/crates/core/src/api/search.rs similarity index 97% rename from core/src/api/search.rs rename to crates/core/src/api/search.rs index 41b1e46fe..3c5273db9 100644 --- a/core/src/api/search.rs +++ b/crates/core/src/api/search.rs @@ -14,20 +14,18 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . 
-use crate::config::defaults; use http::StatusCode; use optics::{Optic, SiteRankings}; use std::sync::Arc; +use stract_config::defaults; +use stract_query::bangs::BangHit; use utoipa::ToSchema; +use webpage::region::Region; use axum::Json; use axum_macros::debug_handler; -use crate::{ - bangs::BangHit, - searcher::{self, SearchQuery, SearchResult, WebsitesResult}, - webpage::region::Region, -}; +use crate::searcher::{self, SearchQuery, SearchResult, WebsitesResult}; use super::State; diff --git a/core/src/api/sites.rs b/crates/core/src/api/sites.rs similarity index 100% rename from core/src/api/sites.rs rename to crates/core/src/api/sites.rs diff --git a/core/src/api/summarize.rs b/crates/core/src/api/summarize.rs similarity index 100% rename from core/src/api/summarize.rs rename to crates/core/src/api/summarize.rs diff --git a/core/src/api/user_count.rs b/crates/core/src/api/user_count.rs similarity index 99% rename from core/src/api/user_count.rs rename to crates/core/src/api/user_count.rs index 96cdc3f7d..efc621d38 100644 --- a/core/src/api/user_count.rs +++ b/crates/core/src/api/user_count.rs @@ -18,11 +18,11 @@ use std::sync::Arc; use anyhow::{anyhow, Result}; use chrono::{NaiveDateTime, Utc}; +use hyperloglog::HyperLogLog; use ring::rand::SecureRandom; use ring::{digest, pbkdf2, rand}; use std::sync::Mutex; -use crate::hyperloglog::HyperLogLog; use crate::metrics::Counter; pub trait Frequency: Clone + Copy + Default { diff --git a/core/src/api/webgraph.rs b/crates/core/src/api/webgraph.rs similarity index 98% rename from core/src/api/webgraph.rs rename to crates/core/src/api/webgraph.rs index bc57c76c5..208a15f1c 100644 --- a/core/src/api/webgraph.rs +++ b/crates/core/src/api/webgraph.rs @@ -17,14 +17,12 @@ use std::{net::SocketAddr, sync::Arc, time::Duration}; use axum::{extract, response::IntoResponse, Json}; +use distributed::{cluster::Cluster, member::Service, retry_strategy::ExponentialBackoff}; use http::StatusCode; use utoipa::{IntoParams, ToSchema}; +use webgraph::{FullEdge, Node}; -use crate::{ - distributed::{cluster::Cluster, member::Service, retry_strategy::ExponentialBackoff, sonic}, - entrypoint::webgraph_server::GraphLevel, - webgraph::{FullEdge, Node}, -}; +use crate::entrypoint::webgraph_server::GraphLevel; use super::State; diff --git a/core/src/autosuggest.rs b/crates/core/src/autosuggest.rs similarity index 96% rename from core/src/autosuggest.rs rename to crates/core/src/autosuggest.rs index f5ce584b7..aace0dea9 100644 --- a/core/src/autosuggest.rs +++ b/crates/core/src/autosuggest.rs @@ -29,7 +29,7 @@ pub struct Autosuggest { } impl Autosuggest { - pub fn load_csv>(path: P) -> Result { + pub fn load_csv(path: &Path) -> Result { let mut queries: Vec = Vec::new(); let mut rdr = csv::Reader::from_path(path)?; diff --git a/core/src/entrypoint/alice.rs b/crates/core/src/entrypoint/alice.rs similarity index 66% rename from core/src/entrypoint/alice.rs rename to crates/core/src/entrypoint/alice.rs index b217bb197..a110a2054 100644 --- a/core/src/entrypoint/alice.rs +++ b/crates/core/src/entrypoint/alice.rs @@ -15,8 +15,14 @@ // along with this program. If not, see . 
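The hunks below are the other half of the `Searcher` refactor shown earlier in `crates/alice/src/lib.rs`: `alice` now only defines the `Searcher` trait, and the HTTP-backed `StractSearcher` implementation added below lives in `stract-core`'s entrypoint. A sketch of what a custom implementation looks like, assuming the trait signature and `SimplifiedWebsite` fields from the diff and an anyhow-based `Result` alias (the `bail!` usage below suggests as much); `StaticSearcher` itself is hypothetical, e.g. for tests:

```rust
use alice::{Searcher, SimplifiedWebsite};

// Hypothetical searcher that returns canned results, useful for offline tests.
struct StaticSearcher {
    canned: Vec<SimplifiedWebsite>,
}

impl Searcher for StaticSearcher {
    fn search(&self, query: &str) -> anyhow::Result<Vec<SimplifiedWebsite>> {
        let _ = query; // ignore the query and always return the canned pages
        Ok(self.canned.clone())
    }
}

// Construct a stub with a single canned page.
fn stub_for_tests() -> StaticSearcher {
    StaticSearcher {
        canned: vec![SimplifiedWebsite {
            title: "Example".to_string(),
            text: "example example example".to_string(),
            url: "https://example.com/".to_string(),
            site: "example.com".to_string(),
        }],
    }
}
```

With the new `Alice::new_executor(question, last_state, searcher)` signature, any such implementation can be passed in place of `StractSearcher`.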
use aes_gcm::{aead::OsRng, Aes256Gcm, KeyInit}; +use anyhow::bail; +use itertools::Itertools; use std::{convert::Infallible, net::SocketAddr, sync::Arc, time::Duration}; +use url::Url; +use alice::{ + Alice, EncodedEncryptedState, EncryptedState, Searcher, SimplifiedWebsite, BASE64_ENGINE, +}; use axum::{ extract, response::{ @@ -27,20 +33,17 @@ use axum::{ Router, }; use base64::Engine; +use distributed::{ + cluster::Cluster, + member::{Member, Service}, +}; +use stract_config::AliceLocalConfig; use tokio::sync::Mutex; use tokio_stream::Stream; use tokio_stream::StreamExt as _; use tracing::info; -use crate::{ - alice::{Alice, EncodedEncryptedState, EncryptedState, BASE64_ENGINE}, - config::AliceLocalConfig, - distributed::{ - cluster::Cluster, - member::{Member, Service}, - }, - ttl_cache::TTLCache, -}; +use crate::{api::search::ApiSearchQuery, searcher::SearchResult, ttl_cache::TTLCache}; pub struct State { pub alice: Alice, @@ -90,6 +93,68 @@ pub struct AliceParams { pub prev_state: Option, } +pub struct StractSearcher { + pub url: String, + pub optic_url: Option, +} + +impl Searcher for StractSearcher { + fn search(&self, query: &str) -> crate::Result> { + let res = { + let optic = self + .optic_url + .as_ref() + .and_then(|url| reqwest::blocking::get(url).ok().and_then(|r| r.text().ok())); + + let client = reqwest::blocking::Client::new(); + let query = ApiSearchQuery { + query: query.trim().to_string(), + num_results: Some(3), + optic, + page: None, + selected_region: None, + site_rankings: None, + return_ranking_signals: false, + flatten_response: false, + safe_search: Some(false), + fetch_discussions: false, + count_results: false, + }; + tracing::debug!("searching at {:?}: {:#?}", self.url, query); + + let res: SearchResult = client.post(&self.url).json(&query).send()?.json()?; + + match res { + SearchResult::Websites(res) => res, + SearchResult::Bang(_) => bail!("Unexpected search result"), + } + }; + + let mut websites = Vec::new(); + + for website in res.webpages { + let webpage = website; + let text = webpage + .snippet + .text() + .map(|t| t.fragments.iter().map(|f| f.text()).join("")) + .unwrap_or_default(); + + let url = Url::parse(&webpage.url).unwrap(); + websites.push(SimplifiedWebsite { + title: webpage.title, + text, + site: url.host_str().unwrap_or_default().to_string(), + url: url.to_string(), + }); + } + + tracing::debug!("search result: {:#?}", websites); + + Ok(websites) + } +} + pub async fn route( extract::State(state): extract::State>, extract::Query(params): extract::Query, @@ -128,14 +193,13 @@ pub async fn route( http::StatusCode::INTERNAL_SERVER_ERROR })?; + let searcher = StractSearcher { + url: format!("http://{}/beta/api/search", search_addr), + optic_url: params.optic, + }; let executor = state .alice - .new_executor( - ¶ms.message, - prev_state, - format!("http://{}/beta/api/search", search_addr), - params.optic, - ) + .new_executor(¶ms.message, prev_state, searcher) .map_err(|e| { info!("error creating executor: {}", e); http::StatusCode::INTERNAL_SERVER_ERROR @@ -174,7 +238,7 @@ pub async fn run(config: AliceLocalConfig) -> Result<(), anyhow::Error> { info!("starting alice"); let alice = Alice::open( - &config.alice_path, + config.alice_path.as_ref(), config.accelerator.clone().map(|acc| acc.into()), &key, )?; diff --git a/core/src/entrypoint/api.rs b/crates/core/src/entrypoint/api.rs similarity index 97% rename from core/src/entrypoint/api.rs rename to crates/core/src/entrypoint/api.rs index 9a6231fb2..003934178 100644 --- 
a/core/src/entrypoint/api.rs +++ b/crates/core/src/entrypoint/api.rs @@ -20,11 +20,10 @@ use anyhow::Result; use crate::{ api::{metrics_router, router, user_count, Counters}, - config, metrics::Label, }; -pub async fn run(config: config::ApiConfig) -> Result<()> { +pub async fn run(config: stract_config::ApiConfig) -> Result<()> { let search_counter_success = crate::metrics::Counter::default(); let search_counter_fail = crate::metrics::Counter::default(); let explore_counter = crate::metrics::Counter::default(); diff --git a/core/src/entrypoint/autosuggest_scrape.rs b/crates/core/src/entrypoint/autosuggest_scrape.rs similarity index 94% rename from core/src/entrypoint/autosuggest_scrape.rs rename to crates/core/src/entrypoint/autosuggest_scrape.rs index be05a2269..9089fde01 100644 --- a/core/src/entrypoint/autosuggest_scrape.rs +++ b/crates/core/src/entrypoint/autosuggest_scrape.rs @@ -84,8 +84,8 @@ impl FromStr for Gl { } } -fn save_queries>(queries: &HashSet, path: P) -> Result<()> { - let mut wtr = Writer::from_path(&path)?; +fn save_queries(queries: &HashSet, path: &Path) -> Result<()> { + let mut wtr = Writer::from_path(path)?; let mut queries: Vec<_> = queries.iter().collect(); queries.sort(); @@ -99,11 +99,11 @@ fn save_queries>(queries: &HashSet, path: P) -> Result<() Ok(()) } -pub fn run>( +pub fn run( queries_to_scrape: usize, gl: Gl, ms_sleep_between_req: u64, - output_dir: P, + output_dir: &Path, ) -> Result<()> { let mut queries = HashSet::new(); let mut queue = VecDeque::new(); @@ -120,9 +120,7 @@ pub fn run>( queue.push_back(c.to_string()); } - let path = output_dir - .as_ref() - .join(format!("queries_{:}.csv", gl.to_string().as_str())); + let path = output_dir.join(format!("queries_{:}.csv", gl.to_string().as_str())); let mut queries_since_last_save = 0; diff --git a/core/src/entrypoint/centrality.rs b/crates/core/src/entrypoint/centrality.rs similarity index 78% rename from core/src/entrypoint/centrality.rs rename to crates/core/src/entrypoint/centrality.rs index 3e6279bbc..bafefd837 100644 --- a/core/src/entrypoint/centrality.rs +++ b/crates/core/src/entrypoint/centrality.rs @@ -15,18 +15,16 @@ // along with this program. If not, see . 
use anyhow::Result; +use kv::{rocksdb_store::RocksDbStore, Kv}; use std::{cmp::Reverse, collections::BinaryHeap, fs::File, path::Path}; - -use crate::{ - kv::{rocksdb_store::RocksDbStore, Kv}, - ranking::inbound_similarity::InboundSimilarity, - webgraph::{ - centrality::{derived_harmonic::DerivedCentrality, harmonic::HarmonicCentrality}, - Node, WebgraphBuilder, - }, +use webgraph::{ + centrality::{derived_harmonic::DerivedCentrality, harmonic::HarmonicCentrality}, + Node, WebgraphBuilder, }; -fn store_csv>(data: Vec<(Node, f64)>, output: P) { +use crate::ranking::inbound_similarity::InboundSimilarity; + +fn store_csv(data: Vec<(Node, f64)>, output: &Path) { let csv_file = File::options() .write(true) .create(true) @@ -69,11 +67,11 @@ impl Ord for SortableFloat { pub struct Centrality; impl Centrality { - pub fn build_harmonic>(webgraph_path: P, base_output: P) { + pub fn build_harmonic(webgraph_path: &Path, base_output: &Path) { tracing::info!("Building harmonic centrality"); let graph = WebgraphBuilder::new(webgraph_path).open(); let harmonic_centrality = HarmonicCentrality::calculate(&graph); - let store = RocksDbStore::open(base_output.as_ref().join("harmonic")); + let store = RocksDbStore::open(&base_output.join("harmonic")); for (node_id, centrality) in harmonic_centrality.iter() { store.insert(*node_id, centrality); @@ -98,32 +96,31 @@ impl Centrality { .map(|(score, id)| (graph.id2node(&id).unwrap(), score.0 .0)) .collect(); - store_csv(harmonics, base_output.as_ref().join("harmonic.csv")); + store_csv(harmonics, &base_output.join("harmonic.csv")); } - pub fn build_similarity>(webgraph_path: P, base_output: P) { + pub fn build_similarity(webgraph_path: &Path, base_output: &Path) { tracing::info!("Building inbound similarity"); let graph = WebgraphBuilder::new(webgraph_path).open(); let sim = InboundSimilarity::build(&graph); - sim.save(base_output.as_ref().join("inbound_similarity")) - .unwrap(); + sim.save(&base_output.join("inbound_similarity")).unwrap(); } - pub fn build_derived_harmonic>( - webgraph_path: P, - host_centrality_path: P, - base_output: P, + pub fn build_derived_harmonic( + webgraph_path: &Path, + host_centrality_path: &Path, + base_output: &Path, ) -> Result<()> { tracing::info!("Building derived harmonic centrality"); let graph = WebgraphBuilder::new(webgraph_path).single_threaded().open(); - let host_centrality = RocksDbStore::open(host_centrality_path.as_ref().join("harmonic")); + let host_centrality = RocksDbStore::open(&host_centrality_path.join("harmonic")); let derived = DerivedCentrality::build( &host_centrality, &graph, - base_output.as_ref().join("derived_harmonic"), + &base_output.join("derived_harmonic"), )?; let mut top_nodes = BinaryHeap::new(); @@ -144,7 +141,7 @@ impl Centrality { .map(|(score, id)| (graph.id2node(&id).unwrap(), score.0 .0)) .collect(); - store_csv(derived, base_output.as_ref().join("derived_centrality.csv")); + store_csv(derived, &base_output.join("derived_centrality.csv")); Ok(()) } diff --git a/core/src/entrypoint/configure.rs b/crates/core/src/entrypoint/configure.rs similarity index 100% rename from core/src/entrypoint/configure.rs rename to crates/core/src/entrypoint/configure.rs diff --git a/crates/core/src/entrypoint/crawler.rs b/crates/core/src/entrypoint/crawler.rs new file mode 100644 index 000000000..f0ae17855 --- /dev/null +++ b/crates/core/src/entrypoint/crawler.rs @@ -0,0 +1,80 @@ +// Stract is an open source web search engine. 
+// Copyright (C) 2023 Stract ApS
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use std::{net::SocketAddr, sync::Arc};
+
+use crawler::{coordinator, planner::make_crawl_plan, router, CrawlCoordinator, Crawler};
+use kv::rocksdb_store::RocksDbStore;
+use webgraph::WebgraphBuilder;
+
+use crate::Result;
+
+pub async fn worker(config: stract_config::CrawlerConfig) -> Result<()> {
+    let crawler = Crawler::new(config).await?;
+
+    crawler.run().await;
+
+    Ok(())
+}
+
+pub async fn coordinator(config: stract_config::CrawlCoordinatorConfig) -> Result<()> {
+    let coordinator = Arc::new(CrawlCoordinator::new(config.job_queue.as_ref())?);
+
+    let addr: SocketAddr = config.host;
+    let server = coordinator::CoordinatorService { coordinator }
+        .bind(addr)
+        .await
+        .unwrap();
+
+    tracing::info!("Crawl coordinator listening on {}", addr);
+
+    loop {
+        let _ = server.accept().await;
+    }
+}
+
+pub async fn router(config: stract_config::CrawlRouterConfig) -> Result<()> {
+    let router = router::Router::new(config.coordinator_addrs.clone()).await?;
+
+    let addr: SocketAddr = config.host;
+
+    let server = router::RouterService { router }.bind(addr).await.unwrap();
+
+    tracing::info!("Crawl router listening on {}", addr);
+
+    loop {
+        let _ = server.accept().await;
+    }
+}
+
+pub fn planner(config: stract_config::CrawlPlannerConfig) -> Result<()> {
+    let page_centrality = RocksDbStore::open(config.page_harmonic_path.as_ref());
+    let host_centrality = RocksDbStore::open(config.host_harmonic_path.as_ref());
+    let page_graph = WebgraphBuilder::new(config.page_graph_path.as_ref()).open();
+    let host_graph = WebgraphBuilder::new(config.host_graph_path.as_ref()).open();
+    let output_path = config.output_path.clone();
+
+    make_crawl_plan(
+        host_centrality,
+        page_centrality,
+        host_graph,
+        page_graph,
+        config,
+        output_path.as_ref(),
+    )?;
+
+    Ok(())
+}
diff --git a/core/src/entrypoint/dmoz_parser.rs b/crates/core/src/entrypoint/dmoz_parser.rs
similarity index 96%
rename from core/src/entrypoint/dmoz_parser.rs
rename to crates/core/src/entrypoint/dmoz_parser.rs
index e54a35e4a..9a99802e3 100644
--- a/core/src/entrypoint/dmoz_parser.rs
+++ b/crates/core/src/entrypoint/dmoz_parser.rs
@@ -122,7 +122,7 @@ impl Page {
     }
 }
 
-pub fn parse<P: AsRef<Path>>(dmoz_file: P) -> Result<Mapper> {
+pub fn parse(dmoz_file: &Path) -> Result<Mapper> {
     let file = File::open(dmoz_file)?;
     let reader = BufReader::new(file);
     let reader = BufReader::new(MultiGzDecoder::new(reader));
@@ -155,7 +155,7 @@ pub fn parse<P: AsRef<Path>>(dmoz_file: P) -> Result<Mapper> {
-pub fn run<P: AsRef<Path>>(dmoz_file: P, output_path: P) -> Result<()> {
+pub fn run(dmoz_file: &Path, output_path: &Path) -> Result<()> {
     let mapper = parse(dmoz_file)?;
 
     mapper.save(output_path)?;
diff --git a/core/src/entrypoint/indexer.rs b/crates/core/src/entrypoint/indexer.rs
similarity index 84%
rename from core/src/entrypoint/indexer.rs
rename to crates/core/src/entrypoint/indexer.rs
index feb7b2241..85aef2e4d 100644
--- a/core/src/entrypoint/indexer.rs
+++ 
b/crates/core/src/entrypoint/indexer.rs
@@ -15,51 +15,49 @@
 // along with this program. If not, see <https://www.gnu.org/licenses/>.
 
 use chrono::Utc;
 use rayon::prelude::*;
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::thread;
 
 use itertools::Itertools;
+use kv::{rocksdb_store::RocksDbStore, Kv};
+use mapreduce::{Map, Reduce, Worker};
 use serde::{Deserialize, Serialize};
 use tokio::pin;
 use tracing::{debug, info, trace, warn};
+use warc::PayloadType;
+use webgraph::{Node, NodeID, Webgraph, WebgraphBuilder};
+use webpage::{safety_classifier, Html, Webpage};
 
-use crate::config;
 use crate::entrypoint::download_all_warc_files;
 use crate::index::{FrozenIndex, Index};
-use crate::kv::rocksdb_store::RocksDbStore;
-use crate::kv::Kv;
-use crate::mapreduce::{Map, Reduce, Worker};
 use crate::ranking::SignalAggregator;
-use crate::warc::PayloadType;
-use crate::webgraph::{Node, NodeID, Webgraph, WebgraphBuilder};
-use crate::webpage::{safety_classifier, Html, Webpage};
 use crate::{human_website_annotations, Result};
 
 pub struct Indexer {}
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub enum JobConfig {
-    Http(config::HttpConfig),
-    Local(config::LocalConfig),
-    S3(config::S3Config),
+    Http(stract_config::HttpConfig),
+    Local(stract_config::LocalConfig),
+    S3(stract_config::S3Config),
 }
 
-impl From<config::WarcSource> for JobConfig {
-    fn from(value: config::WarcSource) -> Self {
+impl From<stract_config::WarcSource> for JobConfig {
+    fn from(value: stract_config::WarcSource) -> Self {
         match value {
-            config::WarcSource::HTTP(config) => JobConfig::Http(config),
-            config::WarcSource::Local(config) => JobConfig::Local(config),
-            config::WarcSource::S3(config) => JobConfig::S3(config),
+            stract_config::WarcSource::HTTP(config) => JobConfig::Http(config),
+            stract_config::WarcSource::Local(config) => JobConfig::Local(config),
+            stract_config::WarcSource::S3(config) => JobConfig::S3(config),
         }
     }
 }
 
-impl From<JobConfig> for config::WarcSource {
+impl From<JobConfig> for stract_config::WarcSource {
     fn from(value: JobConfig) -> Self {
         match value {
-            JobConfig::Http(config) => config::WarcSource::HTTP(config),
-            JobConfig::Local(config) => config::WarcSource::Local(config),
-            JobConfig::S3(config) => config::WarcSource::S3(config),
+            JobConfig::Http(config) => stract_config::WarcSource::HTTP(config),
+            JobConfig::Local(config) => stract_config::WarcSource::Local(config),
+            JobConfig::S3(config) => stract_config::WarcSource::S3(config),
         }
     }
 }
@@ -68,7 +66,7 @@ impl From<JobConfig> for config::WarcSource {
 pub struct Job {
     pub source_config: JobConfig,
     pub warc_paths: Vec<String>,
-    pub base_path: String,
+    pub base_path: PathBuf,
     pub host_centrality_threshold: Option<f64>,
     pub minimum_clean_words: Option<usize>,
 }
@@ -83,18 +81,18 @@ pub struct IndexingWorker {
 
 impl IndexingWorker {
     pub fn new(
-        host_centrality_store_path: String,
-        page_centrality_store_path: Option<String>,
-        webgraph_path: Option<String>,
-        topics_path: Option<String>,
-        safety_classifier_path: Option<String>,
+        host_centrality_store_path: &Path,
+        page_centrality_store_path: Option<&Path>,
+        webgraph_path: Option<&Path>,
+        topics_path: Option<&Path>,
+        safety_classifier_path: Option<&Path>,
     ) -> Self {
         Self {
             host_centrality_store: RocksDbStore::open(
-                Path::new(&host_centrality_store_path).join("harmonic"),
+                &Path::new(&host_centrality_store_path).join("harmonic"),
             ),
             page_centrality_store: page_centrality_store_path
-                .map(|p| RocksDbStore::open(Path::new(&p).join("derived_harmonic"))),
+                .map(|p| RocksDbStore::open(&p.join("derived_harmonic"))),
             webgraph: webgraph_path.map(|path| WebgraphBuilder::new(path).single_threaded().open()),
             topics: topics_path.map(|path|
human_website_annotations::Mapper::open(path).unwrap()),
             safety_classifier: safety_classifier_path
@@ -112,9 +110,9 @@ pub fn process_job(job: &Job, worker: &IndexingWorker) -> Index {
 
     info!("processing {}", name);
 
-    let mut index = Index::open(Path::new(&job.base_path).join(name)).unwrap();
+    let mut index = Index::open(&Path::new(&job.base_path).join(name)).unwrap();
 
-    let source: config::WarcSource = job.source_config.clone().into();
+    let source: stract_config::WarcSource = job.source_config.clone().into();
     let warc_files = download_all_warc_files(&job.warc_paths, &source);
     pin!(warc_files);
 
@@ -288,10 +286,10 @@ pub fn process_job(job: &Job, worker: &IndexingWorker) -> Index {
 }
 
 #[derive(Debug, Serialize, Deserialize)]
-pub struct IndexPointer(String);
+pub struct IndexPointer(PathBuf);
 
-impl From<String> for IndexPointer {
-    fn from(path: String) -> Self {
+impl From<PathBuf> for IndexPointer {
+    fn from(path: PathBuf) -> Self {
         IndexPointer(path)
     }
 }
@@ -340,17 +338,17 @@ impl Reduce for Index {
 }
 
 impl Indexer {
-    pub fn run(config: &config::IndexingLocalConfig) -> Result<()> {
+    pub fn run(config: &stract_config::IndexingLocalConfig) -> Result<()> {
         let warc_paths = config.warc_source.paths()?;
 
         let job_config: JobConfig = config.warc_source.clone().into();
 
         let worker = IndexingWorker::new(
-            config.host_centrality_store_path.clone(),
-            config.page_centrality_store_path.clone(),
-            config.page_webgraph_path.clone(),
-            config.topics_path.clone(),
-            config.safety_classifier_path.clone(),
+            &config.host_centrality_store_path,
+            config.page_centrality_store_path.as_deref(),
+            config.page_webgraph_path.as_deref(),
+            config.topics_path.as_deref(),
+            config.safety_classifier_path.as_deref(),
         );
 
         let indexes = warc_paths
@@ -369,7 +367,7 @@ impl Indexer {
                 base_path: config
                     .output_path
                     .clone()
-                    .unwrap_or_else(|| "data/index".to_string()),
+                    .unwrap_or_else(|| PathBuf::from("data/index")),
                 minimum_clean_words: config.minimum_clean_words,
             })
             .map(|job| {
@@ -401,11 +399,11 @@ impl Indexer {
         threads.push(thread::spawn(move || {
             let mut it = indexes.into_iter();
 
-            let mut index = Index::open(it.next().unwrap().0).unwrap();
+            let mut index = Index::open(it.next().unwrap().0.as_ref()).unwrap();
 
             for other in it {
                 let other_path = other.0;
-                let other = Index::open(&other_path).unwrap();
+                let other = Index::open(other_path.as_ref()).unwrap();
 
                 index = index.merge(other);
                 std::fs::remove_dir_all(other_path).unwrap();
diff --git a/crates/core/src/entrypoint/mod.rs b/crates/core/src/entrypoint/mod.rs
new file mode 100644
index 000000000..01e9fcc5d
--- /dev/null
+++ b/crates/core/src/entrypoint/mod.rs
@@ -0,0 +1,191 @@
+// Stract is an open source web search engine.
+// Copyright (C) 2023 Stract ApS
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+//! The entrypoint module contains all entrypoints that runs the executables.
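The `warc_download` helper added further down in this new module retries each WARC source with an exponential backoff before giving up. A minimal standalone sketch of that retry pattern, reusing the backoff schedule visible in the hunk (10 ms start, 30 s cap, 35 attempts) and the `distributed::retry_strategy::ExponentialBackoff` import it uses; the `download_with_retries` name and the `fetch` closure are illustrative, not part of the patch:

```rust
use std::{thread::sleep, time::Duration};

use distributed::retry_strategy::ExponentialBackoff;

// Retry a fallible fetch until it succeeds or the backoff schedule is exhausted.
fn download_with_retries<T>(mut fetch: impl FnMut() -> anyhow::Result<T>) -> anyhow::Result<T> {
    for dur in ExponentialBackoff::from_millis(10)
        .with_limit(Duration::from_secs(30))
        .take(35)
    {
        match fetch() {
            Ok(val) => return Ok(val),
            Err(err) => {
                // Log and wait for the next backoff step, mirroring download_into_buf.
                tracing::debug!("download failed: {:?}, retrying in {} ms", err, dur.as_millis());
                sleep(dur);
            }
        }
    }

    Err(anyhow::anyhow!("download failed after all retries"))
}
```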
+#[cfg(feature = "with_alice")] +pub mod alice; +pub mod api; +pub mod autosuggest_scrape; +mod centrality; +#[cfg(feature = "dev")] +pub mod configure; +pub mod crawler; +pub mod dmoz_parser; +pub mod indexer; +pub mod safety_classifier; +pub mod search_server; +mod webgraph; +pub mod webgraph_server; + +pub use centrality::Centrality; +pub use entity_index::builder::EntityIndexer; +pub use indexer::Indexer; +use tracing::{debug, log::error}; +use warc::WarcFile; +pub use webgraph::Webgraph; + +fn download_all_warc_files<'a>( + warc_paths: &'a [String], + source: &'a stract_config::WarcSource, +) -> impl Iterator + 'a { + let warc_paths: Vec<_> = warc_paths + .iter() + .map(|warc_path| warc_path.to_string()) + .collect(); + + warc_paths.into_iter().filter_map(|warc_path| { + debug!("downloading warc file {}", &warc_path); + let res = warc_download::download(source, &warc_path); + + if let Err(err) = res { + error!("error while downloading: {:?}", err); + return None; + } + + debug!("finished downloading"); + + Some(res.unwrap()) + }) +} + +mod warc_download { + use std::{ + fs::File, + io::{BufReader, Cursor, Read, Seek, Write}, + path::Path, + thread::sleep, + time::Duration, + }; + + use distributed::retry_strategy::ExponentialBackoff; + use stract_config::{S3Config, WarcSource}; + use tracing::{debug, trace}; + use warc::WarcFile; + + use crate::{Error, Result}; + + pub(super) fn download(source: &WarcSource, warc_path: &str) -> Result { + let mut cursor = Cursor::new(Vec::new()); + download_into_buf(source, warc_path, &mut cursor)?; + cursor.rewind()?; + + let mut buf = Vec::new(); + cursor.read_to_end(&mut buf)?; + + Ok(WarcFile::new(buf)) + } + + fn download_into_buf( + source: &WarcSource, + warc_path: &str, + buf: &mut W, + ) -> Result<()> { + for dur in ExponentialBackoff::from_millis(10) + .with_limit(Duration::from_secs(30)) + .take(35) + { + let res = match source.clone() { + WarcSource::HTTP(config) => download_from_http(warc_path, config.base_url, buf), + WarcSource::Local(config) => load_from_folder(warc_path, &config.folder, buf), + WarcSource::S3(config) => download_from_s3(warc_path, &config, buf), + }; + + if res.is_ok() { + return Ok(()); + } else { + trace!("Error {:?}", res); + } + + debug!("warc download failed: {:?}", res.err().unwrap()); + debug!("retrying in {} ms", dur.as_millis()); + + sleep(dur); + } + + Err(Error::DownloadFailed.into()) + } + + fn load_from_folder(name: &str, folder: &str, buf: &mut W) -> Result<()> { + let f = File::open(Path::new(folder).join(name))?; + let mut reader = BufReader::new(f); + + buf.rewind()?; + + std::io::copy(&mut reader, buf)?; + + Ok(()) + } + + fn download_from_http( + warc_path: &str, + base_url: String, + buf: &mut W, + ) -> Result<()> { + let mut url = base_url; + if !url.ends_with('/') { + url += "/"; + } + url += warc_path; + + let client = reqwest::blocking::ClientBuilder::new() + .tcp_keepalive(None) + .pool_idle_timeout(Duration::from_secs(30 * 60)) + .timeout(Duration::from_secs(30 * 60)) + .connect_timeout(Duration::from_secs(30 * 60)) + .build()?; + let res = client.get(url).send()?; + + if res.status().as_u16() != 200 { + return Err(Error::DownloadFailed.into()); + } + + let bytes = res.bytes()?; + + buf.rewind()?; + std::io::copy(&mut &bytes[..], buf)?; + + Ok(()) + } + + fn download_from_s3( + warc_path: &str, + config: &S3Config, + buf: &mut W, + ) -> Result<()> { + let bucket = s3::Bucket::new( + &config.bucket, + s3::Region::Custom { + region: "".to_string(), + endpoint: config.endpoint.clone(), + }, + 
s3::creds::Credentials { + access_key: Some(config.access_key.clone()), + secret_key: Some(config.secret_key.clone()), + security_token: None, + session_token: None, + expiration: None, + }, + )? + .with_path_style() + .with_request_timeout(Duration::from_secs(30 * 60)); + + let res = bucket.get_object_blocking(warc_path)?; + + buf.write_all(res.bytes())?; + + Ok(()) + } +} diff --git a/core/src/entrypoint/safety_classifier.rs b/crates/core/src/entrypoint/safety_classifier.rs similarity index 83% rename from core/src/entrypoint/safety_classifier.rs rename to crates/core/src/entrypoint/safety_classifier.rs index e3ddd37dd..26112e7cd 100644 --- a/core/src/entrypoint/safety_classifier.rs +++ b/crates/core/src/entrypoint/safety_classifier.rs @@ -17,17 +17,14 @@ use rand::seq::SliceRandom; use tracing::info; -use crate::{webpage, Result}; +use crate::Result; use std::path::Path; const TEST_SIZE: f64 = 0.2; -pub fn train>(dataset: P, output: P) -> Result<()> { - if !dataset.as_ref().exists() { - return Err(anyhow::anyhow!( - "dataset path {:?} does not exist", - dataset.as_ref() - )); +pub fn train(dataset: &Path, output: &Path) -> Result<()> { + if !dataset.exists() { + return Err(anyhow::anyhow!("dataset path {:?} does not exist", dataset)); } let mut model = webpage::safety_classifier::Model::new(); @@ -55,7 +52,7 @@ pub fn train>(dataset: P, output: P) -> Result<()> { Ok(()) } -pub fn predict>(model: P, text: &str) -> Result<()> { +pub fn predict(model: &Path, text: &str) -> Result<()> { let model = webpage::safety_classifier::Model::open(model)?; let pred = model.predict_text(text); diff --git a/core/src/entrypoint/search_server.rs b/crates/core/src/entrypoint/search_server.rs similarity index 89% rename from core/src/entrypoint/search_server.rs rename to crates/core/src/entrypoint/search_server.rs index b2a85aea1..2d1e758f5 100644 --- a/core/src/entrypoint/search_server.rs +++ b/crates/core/src/entrypoint/search_server.rs @@ -16,19 +16,18 @@ use std::{collections::HashMap, path::Path}; +use distributed::{ + cluster::Cluster, + member::{Member, Service}, +}; +use entity_index::EntityIndex; +use imager::image_store::Image; use serde::{Deserialize, Serialize}; +use sonic::sonic_service; use tracing::info; use url::Url; use crate::{ - config, - distributed::{ - cluster::Cluster, - member::{Member, Service}, - sonic, - }, - entity_index::EntityIndex, - image_store::Image, index::Index, inverted_index::{self, RetrievedWebpage}, ranking::{ @@ -36,7 +35,7 @@ use crate::{ models::{lambdamart::LambdaMART, linear::LinearRegression}, }, searcher::{InitialWebsiteResult, LocalSearcher, SearchQuery}, - sonic_service, Result, + Result, }; sonic_service!( @@ -58,14 +57,14 @@ pub struct SearchService { } impl SearchService { - async fn new(config: config::SearchServerConfig) -> Result { + async fn new(config: stract_config::SearchServerConfig) -> Result { let entity_index = config .entity_index_path - .map(|path| EntityIndex::open(path).unwrap()); + .map(|path| EntityIndex::open(&path).unwrap()); let centrality_store = config .host_centrality_store_path - .map(|p| InboundSimilarity::open(Path::new(&p).join("inbound_similarity")).unwrap()); - let search_index = Index::open(config.index_path)?; + .map(|p| InboundSimilarity::open(&Path::new(&p).join("inbound_similarity")).unwrap()); + let search_index = Index::open(&config.index_path)?; let mut local_searcher = LocalSearcher::new(search_index); @@ -78,11 +77,11 @@ impl SearchService { } if let Some(model_path) = config.linear_model_path { - 
local_searcher.set_linear_model(LinearRegression::open(model_path)?); + local_searcher.set_linear_model(LinearRegression::open(&model_path)?); } if let Some(model_path) = config.lambda_model_path { - local_searcher.set_lambda_model(LambdaMART::open(model_path)?); + local_searcher.set_lambda_model(LambdaMART::open(&model_path)?); } if config.build_spell_dictionary { @@ -114,7 +113,7 @@ impl SearchService { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct RetrieveWebsites { - pub websites: Vec, + pub websites: Vec, pub query: String, } #[async_trait::async_trait] @@ -192,7 +191,7 @@ impl sonic::service::Message for GetEntityImage { } } -pub async fn run(config: config::SearchServerConfig) -> Result<()> { +pub async fn run(config: stract_config::SearchServerConfig) -> Result<()> { let addr = config.host; let server = SearchService::new(config).await?.bind(addr).await.unwrap(); diff --git a/core/src/entrypoint/webgraph.rs b/crates/core/src/entrypoint/webgraph.rs similarity index 82% rename from core/src/entrypoint/webgraph.rs rename to crates/core/src/entrypoint/webgraph.rs index 69a6f7d51..fcfdf23df 100644 --- a/core/src/entrypoint/webgraph.rs +++ b/crates/core/src/entrypoint/webgraph.rs @@ -13,20 +13,16 @@ // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use crate::{ - config::WarcSource, - config::{self, WebgraphConstructConfig}, - entrypoint::download_all_warc_files, - mapreduce::Worker, - webgraph::{self, Node, WebgraphWriter}, - webpage::Html, - Result, -}; +use crate::{entrypoint::download_all_warc_files, Result}; use itertools::Itertools; +use mapreduce::Worker; use serde::{Deserialize, Serialize}; use std::{fs, path::Path}; +use stract_config::{WarcSource, WebgraphConstructConfig}; use tokio::pin; use tracing::{info, trace}; +use webgraph::{Node, WebgraphWriter}; +use webpage::Html; #[derive(Debug, Serialize, Deserialize, Clone)] struct GraphPointer { @@ -35,26 +31,26 @@ struct GraphPointer { #[derive(Debug, Serialize, Deserialize, Clone)] pub enum JobConfig { - Http(config::HttpConfig), - Local(config::LocalConfig), - S3(config::S3Config), + Http(stract_config::HttpConfig), + Local(stract_config::LocalConfig), + S3(stract_config::S3Config), } -impl From for JobConfig { - fn from(value: config::WarcSource) -> Self { +impl From for JobConfig { + fn from(value: stract_config::WarcSource) -> Self { match value { - config::WarcSource::HTTP(config) => JobConfig::Http(config), - config::WarcSource::Local(config) => JobConfig::Local(config), - config::WarcSource::S3(config) => JobConfig::S3(config), + stract_config::WarcSource::HTTP(config) => JobConfig::Http(config), + stract_config::WarcSource::Local(config) => JobConfig::Local(config), + stract_config::WarcSource::S3(config) => JobConfig::S3(config), } } } -impl From for config::WarcSource { +impl From for stract_config::WarcSource { fn from(value: JobConfig) -> Self { match value { - JobConfig::Http(config) => config::WarcSource::HTTP(config), - JobConfig::Local(config) => config::WarcSource::Local(config), + JobConfig::Http(config) => stract_config::WarcSource::HTTP(config), + JobConfig::Local(config) => stract_config::WarcSource::Local(config), JobConfig::S3(config) => WarcSource::S3(config), } } @@ -66,18 +62,18 @@ pub struct Job { pub warc_paths: Vec, } -pub fn open_host_graph_writer>(path: P) -> webgraph::WebgraphWriter { +pub fn open_host_graph_writer(path: &Path) -> webgraph::WebgraphWriter { WebgraphWriter::new( path, - 
crate::executor::Executor::single_thread(), + executor::Executor::single_thread(), webgraph::Compression::Lz4, ) } -pub fn open_page_graph_writer>(path: P) -> webgraph::WebgraphWriter { +pub fn open_page_graph_writer(path: &Path) -> webgraph::WebgraphWriter { WebgraphWriter::new( path, - crate::executor::Executor::single_thread(), + executor::Executor::single_thread(), webgraph::Compression::Lz4, ) } @@ -177,8 +173,8 @@ impl Webgraph { let page_path = page_path.join(format!("worker_{i}")); let mut worker = WebgraphWorker { - host_graph: open_host_graph_writer(host_path), - page_graph: open_page_graph_writer(page_path), + host_graph: open_host_graph_writer(&host_path), + page_graph: open_page_graph_writer(&page_path), }; let jobs = jobs.clone(); diff --git a/core/src/entrypoint/webgraph_server.rs b/crates/core/src/entrypoint/webgraph_server.rs similarity index 89% rename from core/src/entrypoint/webgraph_server.rs rename to crates/core/src/entrypoint/webgraph_server.rs index d77072169..88accf6a6 100644 --- a/core/src/entrypoint/webgraph_server.rs +++ b/crates/core/src/entrypoint/webgraph_server.rs @@ -17,28 +17,22 @@ use std::net::SocketAddr; use std::sync::Arc; +use distributed::{ + cluster::Cluster, + member::{Member, Service}, +}; use itertools::Itertools; use serde::Deserialize; use serde::Serialize; +use sonic::{service::Message, sonic_service}; use tracing::info; use url::Url; use utoipa::ToSchema; +use webgraph::{Compression, FullEdge, Node, Webgraph, WebgraphBuilder}; -use crate::config; -use crate::distributed::cluster::Cluster; -use crate::distributed::member::Member; -use crate::distributed::member::Service; -use crate::distributed::sonic; -use crate::distributed::sonic::service::Message; use crate::ranking::inbound_similarity::InboundSimilarity; use crate::searcher::DistributedSearcher; use crate::similar_sites::SimilarSitesFinder; -use crate::sonic_service; -use crate::webgraph::Compression; -use crate::webgraph::FullEdge; -use crate::webgraph::Node; -use crate::webgraph::Webgraph; -use crate::webgraph::WebgraphBuilder; use crate::Result; #[derive(serde::Serialize, serde::Deserialize, ToSchema)] @@ -173,7 +167,7 @@ impl Message for OutgoingLinks { } } -pub async fn run(config: config::WebgraphServerConfig) -> Result<()> { +pub async fn run(config: stract_config::WebgraphServerConfig) -> Result<()> { let addr: SocketAddr = config.host; // dropping the handle leaves the cluster @@ -191,16 +185,16 @@ pub async fn run(config: config::WebgraphServerConfig) -> Result<()> { let searcher = DistributedSearcher::new(cluster); let host_graph = Arc::new( - WebgraphBuilder::new(config.host_graph_path) + WebgraphBuilder::new(&config.host_graph_path) .compression(Compression::Lz4) .open(), ); let page_graph = Arc::new( - WebgraphBuilder::new(config.page_graph_path) + WebgraphBuilder::new(&config.page_graph_path) .compression(Compression::Lz4) .open(), ); - let inbound_similarity = InboundSimilarity::open(config.inbound_similarity_path)?; + let inbound_similarity = InboundSimilarity::open(&config.inbound_similarity_path)?; let similar_sites_finder = SimilarSitesFinder::new( Arc::clone(&host_graph), diff --git a/core/src/feed/mod.rs b/crates/core/src/feed/mod.rs similarity index 94% rename from core/src/feed/mod.rs rename to crates/core/src/feed/mod.rs index e7e69c28f..09a10ba2f 100644 --- a/core/src/feed/mod.rs +++ b/crates/core/src/feed/mod.rs @@ -16,13 +16,13 @@ use std::path::Path; -use crate::tokenizer::{SiteOperatorUrlTokenizer, Tokenizer}; use anyhow::Result; use tantivy::{ 
query::{PhraseQuery, TermQuery}, schema::{IndexRecordOption, TextFieldIndexing, TextOptions, Value}, tokenizer::Tokenizer as TantivyTokenizer, }; +use tokenizer::{SiteOperatorUrlTokenizer, Tokenizer}; use url::Url; pub mod scheduler; @@ -46,9 +46,9 @@ pub struct FeedIndex { } impl FeedIndex { - pub fn open>(path: P) -> Result { - if !path.as_ref().exists() { - std::fs::create_dir_all(path.as_ref())?; + pub fn open(path: &Path) -> Result { + if !path.exists() { + std::fs::create_dir_all(path)?; } let url_tokenizer = Tokenizer::SiteOperator(SiteOperatorUrlTokenizer); @@ -162,7 +162,6 @@ impl FeedIndex { let url = doc .get_first(self.schema.get_field("url")?) .unwrap() - .as_ref() .as_str() .unwrap(); @@ -171,7 +170,6 @@ impl FeedIndex { let kind = doc .get_first(self.schema.get_field("kind")?) .unwrap() - .as_ref() .as_str() .unwrap(); @@ -190,7 +188,7 @@ mod tests { #[test] fn feed_index() { - let mut index = FeedIndex::open(crate::gen_temp_path()).unwrap(); + let mut index = FeedIndex::open(&stdx::gen_temp_path()).unwrap(); let a = Feed { url: Url::parse("https://a.com/feed.xml").unwrap(), diff --git a/core/src/feed/scheduler.rs b/crates/core/src/feed/scheduler.rs similarity index 89% rename from core/src/feed/scheduler.rs rename to crates/core/src/feed/scheduler.rs index d6fdf5165..ab7569651 100644 --- a/core/src/feed/scheduler.rs +++ b/crates/core/src/feed/scheduler.rs @@ -15,16 +15,13 @@ // along with this program. If not, see . use hashbrown::{HashMap, HashSet}; +use kv::rocksdb_store::RocksDbStore; use url::Url; - -use crate::{ - kv::rocksdb_store::RocksDbStore, - webgraph::{ - centrality::{top_hosts, TopHosts}, - NodeID, Webgraph, - }, - webpage::url_ext::UrlExt, +use webgraph::{ + centrality::{top_hosts, TopHosts}, + NodeID, Webgraph, }; +use webpage::url_ext::UrlExt; use super::{Feed, FeedIndex}; @@ -68,7 +65,7 @@ pub fn schedule( all_feeds .entry(domain) .or_insert(HashSet::new()) - .extend(index.search(&host.name).unwrap().into_iter()); + .extend(index.search(&host.name).unwrap()); } let mut splits = Vec::new(); diff --git a/core/src/human_website_annotations.rs b/crates/core/src/human_website_annotations.rs similarity index 96% rename from core/src/human_website_annotations.rs rename to crates/core/src/human_website_annotations.rs index 6d3264219..fb5da201c 100644 --- a/core/src/human_website_annotations.rs +++ b/crates/core/src/human_website_annotations.rs @@ -73,7 +73,7 @@ impl From> for Mapper { } impl Mapper { - pub fn save>(self, path: P) -> Result<()> { + pub fn save(self, path: &Path) -> Result<()> { let mut file = File::options() .create(true) .truncate(true) @@ -86,7 +86,7 @@ impl Mapper { Ok(()) } - pub fn open>(path: P) -> Result { + pub fn open(path: &Path) -> Result { let mut reader = BufReader::new(File::open(path)?); let mut bytes = Vec::new(); diff --git a/core/src/improvement.rs b/crates/core/src/improvement.rs similarity index 99% rename from core/src/improvement.rs rename to crates/core/src/improvement.rs index 439299a75..08a9dacc8 100644 --- a/core/src/improvement.rs +++ b/crates/core/src/improvement.rs @@ -18,13 +18,12 @@ use std::{sync::Arc, time::Duration}; use chrono::{DateTime, Timelike, Utc}; use scylla::{prepared_statement::PreparedStatement, SessionBuilder}; +use stdx::leaky_queue::LeakyQueue; use thiserror::Error; use tokio::{sync::Mutex, time}; use url::Url; use uuid::Uuid; -use crate::leaky_queue::LeakyQueue; - #[derive(Debug, Error)] enum Error { #[error("scylla query")] diff --git a/core/src/index.rs b/crates/core/src/index.rs similarity index 
89% rename from core/src/index.rs rename to crates/core/src/index.rs index 8da18312e..7f00578e5 100644 --- a/core/src/index.rs +++ b/crates/core/src/index.rs @@ -14,25 +14,27 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use std::collections::HashSet; use std::fs; use std::path::Path; use std::sync::Arc; +use std::{collections::HashSet, path::PathBuf}; use serde::{Deserialize, Serialize}; +use stdx::directory::{self, DirEntry}; use tantivy::schema::Schema; use tantivy::tokenizer::TokenizerManager; use url::Url; +use webgraph::NodeID; +use webpage::{ + region::{Region, RegionCount}, + Webpage, +}; -use crate::collector::MainCollector; -use crate::directory::{self, DirEntry}; use crate::inverted_index::{self, InvertedIndex}; use crate::query::Query; use crate::search_ctx::Ctx; use crate::subdomain_count::SubdomainCounter; -use crate::webgraph::NodeID; -use crate::webpage::region::{Region, RegionCount}; -use crate::webpage::Webpage; +use crate::MainCollector; use crate::Result; const INVERTED_INDEX_SUBFOLDER_NAME: &str = "inverted_index"; @@ -43,27 +45,24 @@ pub struct Index { pub inverted_index: InvertedIndex, pub region_count: RegionCount, pub subdomain_counter: SubdomainCounter, - pub path: String, + pub path: PathBuf, } impl Index { - pub fn open>(path: P) -> Result { - if !path.as_ref().exists() { - fs::create_dir_all(path.as_ref())?; + pub fn open(path: &Path) -> Result { + if !path.exists() { + fs::create_dir_all(path)?; } - let inverted_index = - InvertedIndex::open(path.as_ref().join(INVERTED_INDEX_SUBFOLDER_NAME))?; + let inverted_index = InvertedIndex::open(&path.join(INVERTED_INDEX_SUBFOLDER_NAME))?; - let region_count = RegionCount::open(path.as_ref().join(REGION_COUNT_FILE_NAME)); + let region_count = RegionCount::open(&path.join(REGION_COUNT_FILE_NAME)); Ok(Self { inverted_index, region_count, - subdomain_counter: SubdomainCounter::open( - path.as_ref().join(SUBDOMAIN_COUNT_SUBFOLDER_NAME), - ), - path: path.as_ref().to_str().unwrap().to_string(), + subdomain_counter: SubdomainCounter::open(&path.join(SUBDOMAIN_COUNT_SUBFOLDER_NAME)), + path: path.to_owned(), }) } @@ -79,8 +78,8 @@ impl Index { #[cfg(test)] pub fn temporary() -> Result { - let path = crate::gen_temp_path(); - Self::open(path) + let path = stdx::gen_temp_path(); + Self::open(&path) } pub fn insert(&mut self, webpage: Webpage) -> Result<()> { @@ -122,7 +121,7 @@ impl Index { pub fn retrieve_websites( &self, - websites: &[inverted_index::WebsitePointer], + websites: &[collector::WebsitePointer], query: &Query, ) -> Result> { self.inverted_index.retrieve_websites(websites, query) @@ -136,7 +135,7 @@ impl Index { self.subdomain_counter.merge(other.subdomain_counter); drop(self.subdomain_counter); - Self::open(&self.path).expect("failed to open index") + Self::open(self.path.as_ref()).expect("failed to open index") } pub fn schema(&self) -> Arc { @@ -178,7 +177,7 @@ impl From for Index { } directory::recreate_folder(&frozen.root).unwrap(); - Index::open(path).expect("failed to open index") + Index::open(path.as_ref()).expect("failed to open index") } } @@ -187,7 +186,7 @@ impl From for FrozenIndex { index.commit().expect("failed to commit index"); let path = index.path.clone(); index.inverted_index.stop(); - let root = directory::scan_folder(path).unwrap(); + let root = directory::scan_folder(path.to_str().unwrap().to_string()).unwrap(); Self { root } } diff --git a/core/src/inverted_index.rs b/crates/core/src/inverted_index.rs similarity 
index 95% rename from core/src/inverted_index.rs rename to crates/core/src/inverted_index.rs index 9c1450c64..1a9d97ebd 100644 --- a/core/src/inverted_index.rs +++ b/crates/core/src/inverted_index.rs @@ -26,38 +26,37 @@ //! This allows us to perform more advanced queries than just term lookups, //! but the principle is the same. use chrono::NaiveDateTime; +use collector::{DocAddress, WebsitePointer}; +use schema::{ + create_schema, fastfield_reader::FastFieldReader, FastField, Field, TextField, ALL_FIELDS, +}; use serde::{Deserialize, Serialize}; +use stract_config::SnippetConfig; +use stract_query::shortcircuit::ShortCircuitQuery; use tantivy::collector::Count; use tantivy::directory::MmapDirectory; use tantivy::merge_policy::NoMergePolicy; use tantivy::schema::{Schema, Value}; use tantivy::tokenizer::TokenizerManager; use tantivy::{IndexReader, IndexWriter, SegmentMeta, TantivyDocument}; +use tokenizer::{ + BigramTokenizer, Identity, JsonField, SiteOperatorUrlTokenizer, Tokenizer, TrigramTokenizer, +}; use url::Url; +use webgraph::NodeID; +use webpage::{region::Region, schema_org, Webpage}; -use crate::collector::{Hashes, MainCollector}; -use crate::config::SnippetConfig; -use crate::fastfield_reader::FastFieldReader; -use crate::query::shortcircuit::ShortCircuitQuery; use crate::query::Query; -use crate::ranking::initial::Score; use crate::ranking::pipeline::RankingWebsite; use crate::ranking::SignalAggregator; -use crate::schema::{FastField, Field, TextField, ALL_FIELDS}; use crate::search_ctx::Ctx; +use crate::snippet; use crate::snippet::TextSnippet; -use crate::tokenizer::{ - BigramTokenizer, Identity, JsonField, SiteOperatorUrlTokenizer, TrigramTokenizer, -}; -use crate::webgraph::NodeID; -use crate::webpage::region::Region; -use crate::webpage::{schema_org, Webpage}; +use crate::MainCollector; use crate::Result; -use crate::{combine_u64s, snippet}; -use crate::{schema::create_schema, tokenizer::Tokenizer}; use std::collections::HashSet; use std::fs; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::sync::Arc; #[derive(Debug, Serialize, Deserialize)] @@ -66,44 +65,13 @@ pub struct InitialSearchResult { pub top_websites: Vec, } -#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] -pub struct WebsitePointer { - pub score: Score, - pub hashes: Hashes, - pub address: DocAddress, -} - -#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)] -pub struct DocAddress { - pub segment: u32, - pub doc_id: u32, -} - -impl From for DocAddress { - fn from(address: tantivy::DocAddress) -> Self { - Self { - segment: address.segment_ord, - doc_id: address.doc_id, - } - } -} - -impl From for tantivy::DocAddress { - fn from(address: DocAddress) -> Self { - Self { - segment_ord: address.segment, - doc_id: address.doc_id, - } - } -} - struct SegmentMergeCandidate { num_docs: u32, segments: Vec, } pub struct InvertedIndex { - pub path: String, + pub path: PathBuf, tantivy_index: tantivy::Index, writer: IndexWriter, reader: IndexReader, @@ -112,11 +80,11 @@ pub struct InvertedIndex { } impl InvertedIndex { - pub fn open>(path: P) -> Result { + pub fn open(path: &Path) -> Result { let schema = create_schema(); - let tantivy_index = if path.as_ref().exists() { - let mmap_directory = MmapDirectory::open(&path)?; + let tantivy_index = if path.exists() { + let mmap_directory = MmapDirectory::open(path)?; tantivy::Index::open(mmap_directory)? 
} else { let index_settings = tantivy::IndexSettings { @@ -127,8 +95,8 @@ impl InvertedIndex { ..Default::default() }; - fs::create_dir_all(&path)?; - let mmap_directory = MmapDirectory::open(&path)?; + fs::create_dir_all(path)?; + let mmap_directory = MmapDirectory::open(path)?; tantivy::Index::create(mmap_directory, schema.clone(), index_settings)? }; @@ -178,7 +146,7 @@ impl InvertedIndex { writer, reader, schema: Arc::new(schema), - path: path.as_ref().to_str().unwrap().to_string(), + path: path.to_owned(), tantivy_index, snippet_config: SnippetConfig::default(), }) @@ -198,8 +166,8 @@ impl InvertedIndex { #[cfg(test)] pub fn temporary() -> Result { - let path = crate::gen_temp_path(); - Self::open(path) + let path = stdx::gen_temp_path(); + Self::open(&path) } pub fn insert(&mut self, webpage: Webpage) -> Result<()> { @@ -320,7 +288,7 @@ impl InvertedIndex { if id1 == u64::MAX && id2 == u64::MAX { Ok(None) } else { - let id = combine_u64s([id1, id2]); + let id = stdx::combine_u64s([id1, id2]); Ok(Some(id.into())) } } @@ -339,11 +307,15 @@ impl InvertedIndex { for page in &mut webpages { if !page.body.is_empty() { - page.snippet = - snippet::generate(query, &page.body, &page.region, self.snippet_config.clone()); + page.snippet = snippet::generate( + query.simple_terms().iter().map(|s| s.as_str()), + &page.body, + &page.region, + self.snippet_config.clone(), + ); } else { page.snippet = snippet::generate( - query, + query.simple_terms().iter().map(|s| s.as_str()), page.description.as_deref().unwrap_or_default(), &page.region, self.snippet_config.clone(), @@ -436,11 +408,11 @@ impl InvertedIndex { .expect("failed to load tantivy metadata for index"); let x = other.path.clone(); - let other_path = Path::new(x.as_str()); + let other_path = Path::new(&x); other.writer.wait_merging_threads().unwrap(); let path = self.path.clone(); - let self_path = Path::new(path.as_str()); + let self_path = Path::new(&path); self.writer.wait_merging_threads().unwrap(); let ids: HashSet<_> = meta.segments.iter().map(|segment| segment.id()).collect(); @@ -474,7 +446,7 @@ impl InvertedIndex { .unwrap(); } - Self::open(path).expect("failed to open index") + Self::open(&path).expect("failed to open index") } pub fn stop(self) { @@ -655,11 +627,11 @@ impl From for RetrievedWebpage { #[cfg(test)] mod tests { use maplit::hashmap; + use webpage::Html; use crate::{ ranking::{Ranker, SignalAggregator}, searcher::SearchQuery, - webpage::Html, }; use super::*; diff --git a/core/src/lib.rs b/crates/core/src/lib.rs similarity index 53% rename from core/src/lib.rs rename to crates/core/src/lib.rs index 7069c4ca3..9d9b818f3 100644 --- a/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -23,72 +23,32 @@ #![allow(clippy::cast_precision_loss)] #![allow(clippy::missing_errors_doc)] -use std::path::PathBuf; -use thiserror::Error; - pub mod entrypoint; mod inverted_index; -pub mod mapreduce; - -#[cfg(feature = "with_alice")] -pub mod alice; mod api; mod autosuggest; -mod bangs; -mod bloom; -mod collector; -pub mod config; -pub mod crawler; -mod directory; -mod distributed; -mod entity_index; -mod enum_map; -mod executor; -mod fastfield_reader; pub mod feed; mod human_website_annotations; -pub mod hyperloglog; -mod image_downloader; -mod image_store; mod improvement; pub mod index; -mod intmap; -mod kahan_sum; -mod kv; -mod leaky_queue; -#[cfg(feature = "libtorch")] -mod llm_utils; mod metrics; -pub mod naive_bayes; -pub mod prehashed; -#[cfg(feature = "libtorch")] -mod qa_model; mod query; pub mod ranking; -mod schema; mod 
search_ctx; mod search_prettifier; pub mod searcher; -mod simhash; pub mod similar_sites; mod snippet; -pub mod spell; mod subdomain_count; -#[cfg(feature = "libtorch")] -pub mod summarizer; -mod tokenizer; #[allow(unused)] mod ttl_cache; -pub mod warc; -pub mod webgraph; -pub mod webpage; mod widgets; -#[derive(Error, Debug)] +#[derive(thiserror::Error, Debug)] pub enum Error { - #[error("Failed to parse WARC file")] - WarcParse(&'static str), + #[error("WARC error")] + Warc(#[from] warc::Error), #[error("Encountered an empty required field ({0}) when converting to tantivy")] EmptyField(&'static str), @@ -120,32 +80,7 @@ pub enum Error { pub(crate) type Result = std::result::Result; -// taken from https://docs.rs/sled/0.34.7/src/sled/config.rs.html#445 -pub fn gen_temp_path() -> PathBuf { - use std::sync::atomic::{AtomicUsize, Ordering}; - use std::time::SystemTime; - - static SALT_COUNTER: AtomicUsize = AtomicUsize::new(0); - - let seed = SALT_COUNTER.fetch_add(1, Ordering::SeqCst) as u128; - - let now = SystemTime::now() - .duration_since(SystemTime::UNIX_EPOCH) - .unwrap() - .as_nanos() - << 48; - - let pid = u128::from(std::process::id()); - - let salt = (pid << 16) + now + seed; - - if cfg!(target_os = "linux") { - // use shared memory for temporary linux files - format!("/dev/shm/pagecache.tmp.{salt}").into() - } else { - std::env::temp_dir().join(format!("pagecache.tmp.{salt}")) - } -} +pub type MainCollector = collector::TweakedScoreTopCollector; #[cfg(test)] fn rand_words(num_words: usize) -> String { @@ -166,43 +101,3 @@ fn rand_words(num_words: usize) -> String { res.trim().to_string() } - -fn ceil_char_boundary(str: &str, index: usize) -> usize { - let mut res = index; - - while !str.is_char_boundary(res) && res < str.len() { - res += 1; - } - - res -} - -fn floor_char_boundary(str: &str, index: usize) -> usize { - let mut res = index; - - while !str.is_char_boundary(res) && res > 0 { - res -= 1; - } - - res -} - -pub fn split_u128(num: u128) -> [u64; 2] { - [(num >> 64) as u64, num as u64] -} - -pub fn combine_u64s(nums: [u64; 2]) -> u128 { - ((nums[0] as u128) << 64) | (nums[1] as u128) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn split_combine_u128() { - for num in 0..10000_u128 { - assert_eq!(combine_u64s(split_u128(num)), num); - } - } -} diff --git a/core/src/metrics.rs b/crates/core/src/metrics.rs similarity index 100% rename from core/src/metrics.rs rename to crates/core/src/metrics.rs diff --git a/core/src/query/mod.rs b/crates/core/src/query.rs similarity index 98% rename from core/src/query/mod.rs rename to crates/core/src/query.rs index c1066cc61..1eeea5c1a 100644 --- a/core/src/query/mod.rs +++ b/crates/core/src/query.rs @@ -15,30 +15,18 @@ // along with this program. If not, see . 
use crate::{ - inverted_index::InvertedIndex, - query::parser::TermCompound, - ranking::SignalCoefficient, - schema::{Field, TextField}, - search_ctx::Ctx, - searcher::SearchQuery, - webpage::{region::Region, safety_classifier}, - Result, + inverted_index::InvertedIndex, ranking::SignalCoefficient, search_ctx::Ctx, + searcher::SearchQuery, Result, }; use optics::{Optic, SiteRankings}; +use schema::{Field, TextField}; use std::collections::HashMap; +use stract_query::{ + optic::AsMultipleTantivyQuery, + parser::{CompoundAwareTerm, Term, TermCompound}, +}; use tantivy::query::{BooleanQuery, Occur, QueryClone, TermQuery}; - -mod const_query; -pub mod intersection; -pub mod optic; -pub mod parser; -mod pattern_query; -pub mod shortcircuit; -pub mod union; - -use parser::Term; - -use self::{optic::AsMultipleTantivyQuery, parser::CompoundAwareTerm}; +use webpage::{region::Region, safety_classifier}; const MAX_SIMILAR_TERMS: usize = 10; @@ -58,7 +46,7 @@ pub struct Query { impl Query { pub fn parse(ctx: &Ctx, query: &SearchQuery, index: &InvertedIndex) -> Result { - let parsed_terms = parser::parse(&query.query); + let parsed_terms = stract_query::parser::parse(&query.query); let mut term_count = HashMap::new(); let mut terms = Vec::new(); @@ -243,8 +231,8 @@ mod tests { index::Index, rand_words, searcher::{LocalSearcher, SearchQuery}, - webpage::Webpage, }; + use webpage::Webpage; use super::*; diff --git a/core/src/ranking/bitvec_similarity.rs b/crates/core/src/ranking/bitvec_similarity.rs similarity index 99% rename from core/src/ranking/bitvec_similarity.rs rename to crates/core/src/ranking/bitvec_similarity.rs index f859db602..eb2467356 100644 --- a/core/src/ranking/bitvec_similarity.rs +++ b/crates/core/src/ranking/bitvec_similarity.rs @@ -17,7 +17,7 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; -use crate::hyperloglog::HyperLogLog; +use hyperloglog::HyperLogLog; const THRESHOLD_SIM_ESTIMATE: f64 = 0.1; diff --git a/core/src/ranking/inbound_similarity.rs b/crates/core/src/ranking/inbound_similarity.rs similarity index 94% rename from core/src/ranking/inbound_similarity.rs rename to crates/core/src/ranking/inbound_similarity.rs index dd49abeaa..bd0d82289 100644 --- a/core/src/ranking/inbound_similarity.rs +++ b/crates/core/src/ranking/inbound_similarity.rs @@ -24,12 +24,10 @@ use std::{ use dashmap::DashMap; use rayon::prelude::ParallelIterator; use serde::{Deserialize, Serialize}; +use stdx::intmap::{IntMap, IntSet}; +use webgraph::{NodeID, Webgraph}; -use crate::{ - intmap::{IntMap, IntSet}, - webgraph::{NodeID, Webgraph}, - Result, -}; +use crate::Result; use super::bitvec_similarity; @@ -179,7 +177,7 @@ impl InboundSimilarity { } } - pub fn save>(&self, path: P) -> Result<()> { + pub fn save(&self, path: &Path) -> Result<()> { let mut file = BufWriter::new( File::options() .create(true) @@ -197,7 +195,7 @@ impl InboundSimilarity { self.vectors.get(node) } - pub fn open>(path: P) -> Result { + pub fn open(path: &Path) -> Result { let file = File::open(path)?; let mut reader = BufReader::new(file); @@ -215,14 +213,13 @@ impl InboundSimilarity { #[cfg(test)] mod tests { use optics::SiteRankings; + use webgraph::{Node, WebgraphWriter}; + use webpage::{Html, Webpage}; use crate::{ - gen_temp_path, index::Index, rand_words, searcher::{LocalSearcher, SearchQuery}, - webgraph::{Node, WebgraphWriter}, - webpage::{Html, Webpage}, }; use super::*; @@ -230,9 +227,9 @@ mod tests { #[test] fn it_favors_liked_sites() { let mut wrt = WebgraphWriter::new( - gen_temp_path(), - 
crate::executor::Executor::single_thread(), - crate::webgraph::Compression::default(), + &stdx::gen_temp_path(), + executor::Executor::single_thread(), + webgraph::Compression::default(), ); wrt.insert(Node::from("a.com"), Node::from("b.com"), String::new()); @@ -259,9 +256,9 @@ mod tests { #[test] fn it_ranks_search_results() { let mut wrt = WebgraphWriter::new( - crate::gen_temp_path(), - crate::executor::Executor::single_thread(), - crate::webgraph::Compression::default(), + &stdx::gen_temp_path(), + executor::Executor::single_thread(), + webgraph::Compression::default(), ); wrt.insert(Node::from("b.com"), Node::from("a.com"), String::new()); diff --git a/core/src/ranking/initial.rs b/crates/core/src/ranking/initial.rs similarity index 93% rename from core/src/ranking/initial.rs rename to crates/core/src/ranking/initial.rs index beece4c0f..d16597d3d 100644 --- a/core/src/ranking/initial.rs +++ b/crates/core/src/ranking/initial.rs @@ -14,9 +14,9 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use crate::fastfield_reader::FastFieldReader; use chrono::Utc; -use serde::{Deserialize, Serialize}; +use collector::Score; +use schema::fastfield_reader::FastFieldReader; use tantivy::collector::{ScoreSegmentTweaker, ScoreTweaker}; use tantivy::{DocId, SegmentReader}; @@ -69,11 +69,6 @@ pub struct InitialSegmentScoreTweaker { aggregator: SignalAggregator, } -#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] -pub struct Score { - pub total: f64, -} - impl ScoreSegmentTweaker for InitialSegmentScoreTweaker { fn score(&mut self, doc: DocId, _score: tantivy::Score) -> Score { let mut total = self diff --git a/core/src/ranking/mod.rs b/crates/core/src/ranking/mod.rs similarity index 99% rename from core/src/ranking/mod.rs rename to crates/core/src/ranking/mod.rs index 01a9a5dfe..c907e974b 100644 --- a/core/src/ranking/mod.rs +++ b/crates/core/src/ranking/mod.rs @@ -15,7 +15,6 @@ // along with this program. If not, see . 
mod bitvec_similarity; -pub mod bm25; pub mod inbound_similarity; pub mod initial; pub mod models; @@ -25,14 +24,15 @@ pub mod query_centrality; pub mod signal; use initial::InitialScoreTweaker; +use schema::fastfield_reader::FastFieldReader; +use stract_config::CollectorConfig; +use webpage::region::Region; +use collector::{MaxDocsConsidered, TopDocs}; use crate::{ - collector::{MainCollector, MaxDocsConsidered, TopDocs}, - config::CollectorConfig, - fastfield_reader::FastFieldReader, search_ctx::Ctx, searcher::NUM_RESULTS_PER_PAGE, - webpage::region::Region, + MainCollector, }; pub use self::signal::*; @@ -133,11 +133,11 @@ impl Ranker { mod tests { use optics::Optic; + use webpage::{Html, Webpage}; use crate::{ index::Index, searcher::{LocalSearcher, SearchQuery}, - webpage::{Html, Webpage}, }; const CONTENT: &str = "this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever"; @@ -441,7 +441,7 @@ mod tests { fetch_time_ms: 500, page_centrality: 0.0, pre_computed_score: 0.0, - + node_id: None, dmoz_description: None, safety_classification: None, diff --git a/core/src/ranking/models/cross_encoder.rs b/crates/core/src/ranking/models/cross_encoder.rs similarity index 93% rename from core/src/ranking/models/cross_encoder.rs rename to crates/core/src/ranking/models/cross_encoder.rs index 46402c29e..fd3e17212 100644 --- a/core/src/ranking/models/cross_encoder.rs +++ b/crates/core/src/ranking/models/cross_encoder.rs @@ -38,7 +38,7 @@ mod model { } impl CrossEncoderModel { - pub fn open>(folder: P) -> Result { + pub fn open(folder: &Path) -> Result { let truncation = TruncationParams { max_length: TRUNCATE_INPUT, ..Default::default() @@ -48,16 +48,15 @@ mod model { ..Default::default() }; - let mut tokenizer = - tokenizers::Tokenizer::from_file(folder.as_ref().join("tokenizer.json")) - .map_err(|_| anyhow!("couldn't open tokenizer"))?; + let mut tokenizer = tokenizers::Tokenizer::from_file(folder.join("tokenizer.json")) + .map_err(|_| anyhow!("couldn't open tokenizer"))?; tokenizer .with_truncation(Some(truncation)) .map_err(|_| anyhow!("tokenizer truncation settings"))?; tokenizer.with_padding(Some(padding)); - let model = tch::CModule::load(folder.as_ref().join("model.pt"))?; + let model = tch::CModule::load(folder.join("model.pt"))?; Ok(Self { tokenizer, model }) } @@ -159,7 +158,7 @@ mod tests { #[test] fn sanity_check() { - let model = CrossEncoderModel::open("../data/cross_encoder") + let model = CrossEncoderModel::open("../../data/cross_encoder".as_ref()) .expect("Failed to find cross-encoder model"); let s = model.run( diff --git a/core/src/ranking/models/lambdamart.rs b/crates/core/src/ranking/models/lambdamart.rs similarity index 98% rename from core/src/ranking/models/lambdamart.rs rename to crates/core/src/ranking/models/lambdamart.rs index 2a8019ae5..b4f0f75aa 100644 --- a/core/src/ranking/models/lambdamart.rs +++ b/crates/core/src/ranking/models/lambdamart.rs @@ -15,11 +15,9 @@ // along with this program. If not, see . 
use std::{path::Path, str::FromStr}; +use stdx::enum_map::EnumMap; -use crate::{ - enum_map::EnumMap, - ranking::{signal, Signal}, -}; +use crate::ranking::{signal, Signal}; type Result = std::result::Result; @@ -259,7 +257,7 @@ pub struct LambdaMART { } impl LambdaMART { - pub fn open>(path: P) -> Result { + pub fn open(path: &Path) -> Result { let s = std::fs::read_to_string(path)?; let tree = Tree::parse(&s)?; Ok(Self { tree }) diff --git a/core/src/ranking/models/linear.rs b/crates/core/src/ranking/models/linear.rs similarity index 94% rename from core/src/ranking/models/linear.rs rename to crates/core/src/ranking/models/linear.rs index 81ed458e9..9d6c4f48e 100644 --- a/core/src/ranking/models/linear.rs +++ b/crates/core/src/ranking/models/linear.rs @@ -17,8 +17,8 @@ use std::fs::File; use std::io::BufReader; use std::{collections::HashMap, path::Path}; +use stdx::enum_map::EnumMap; -use crate::enum_map::EnumMap; use crate::ranking::Signal; use crate::Result; @@ -45,7 +45,7 @@ pub struct LinearRegression { } impl LinearRegression { - pub fn open>(path: P) -> Result { + pub fn open(path: &Path) -> Result { let file = File::open(path)?; let reader = BufReader::new(file); let model: SerialziedLinearRegression = serde_json::from_reader(reader)?; diff --git a/core/src/ranking/models/mod.rs b/crates/core/src/ranking/models/mod.rs similarity index 100% rename from core/src/ranking/models/mod.rs rename to crates/core/src/ranking/models/mod.rs diff --git a/core/src/ranking/optics.rs b/crates/core/src/ranking/optics.rs similarity index 97% rename from core/src/ranking/optics.rs rename to crates/core/src/ranking/optics.rs index e9f18f76b..d4fa681b9 100644 --- a/core/src/ranking/optics.rs +++ b/crates/core/src/ranking/optics.rs @@ -17,14 +17,13 @@ #[cfg(test)] mod tests { use optics::SiteRankings; + use webgraph::{Node, WebgraphWriter}; + use webpage::{Html, Webpage}; use crate::{ - gen_temp_path, index::Index, ranking::inbound_similarity::InboundSimilarity, searcher::{LocalSearcher, SearchQuery}, - webgraph::{Node, WebgraphWriter}, - webpage::{Html, Webpage}, }; const CONTENT: &str = "this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever"; @@ -33,9 +32,9 @@ mod tests { let mut index = Index::temporary().expect("Unable to open index"); let mut wrt = WebgraphWriter::new( - gen_temp_path(), - crate::executor::Executor::single_thread(), - crate::webgraph::Compression::default(), + &stdx::gen_temp_path(), + executor::Executor::single_thread(), + webgraph::Compression::default(), ); wrt.insert( diff --git a/core/src/ranking/pipeline.rs b/crates/core/src/ranking/pipeline.rs similarity index 96% rename from core/src/ranking/pipeline.rs rename to crates/core/src/ranking/pipeline.rs index e952bb1bc..92a0716b6 100644 --- a/core/src/ranking/pipeline.rs +++ b/crates/core/src/ranking/pipeline.rs @@ -16,16 +16,12 @@ use std::sync::Arc; +use collector::{BucketCollector, WebsitePointer}; use serde::{Deserialize, Serialize}; +use stdx::enum_map::EnumMap; +use stract_config::CollectorConfig; -use crate::{ - collector::{self, BucketCollector}, - config::CollectorConfig, - enum_map::EnumMap, - inverted_index::WebsitePointer, - searcher::SearchQuery, - Result, -}; +use crate::{searcher::SearchQuery, Result}; use super::{ models::lambdamart::{self, LambdaMART}, @@ -34,25 +30,22 @@ use super::{ use super::models::cross_encoder::CrossEncoder; -pub 
trait AsRankingWebsite: Clone { +pub trait AsRankingWebsite: Clone + collector::Doc { fn as_ranking(&self) -> &RankingWebsite; fn as_mut_ranking(&mut self) -> &mut RankingWebsite; } -impl collector::Doc for T -where - T: AsRankingWebsite, -{ +impl collector::Doc for RankingWebsite { fn score(&self) -> f64 { - self.as_ranking().score + self.score } fn id(&self) -> &tantivy::DocId { - &self.as_ranking().pointer.address.doc_id + &self.pointer.address.doc_id } fn hashes(&self) -> collector::Hashes { - self.as_ranking().pointer.hashes + self.pointer.hashes } } @@ -393,13 +386,11 @@ impl RankingPipeline { #[cfg(test)] mod tests { + use collector::{DocAddress, Hashes, Score}; use itertools::Itertools; + use stdx::prehashed::Prehashed; use crate::ranking::models::cross_encoder::DummyCrossEncoder; - use crate::{ - collector::Hashes, inverted_index::DocAddress, prehashed::Prehashed, - ranking::initial::Score, - }; use super::*; diff --git a/core/src/ranking/query_centrality.rs b/crates/core/src/ranking/query_centrality.rs similarity index 97% rename from core/src/ranking/query_centrality.rs rename to crates/core/src/ranking/query_centrality.rs index 1551b826b..a7fa381ee 100644 --- a/core/src/ranking/query_centrality.rs +++ b/crates/core/src/ranking/query_centrality.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use crate::webgraph::NodeID; +use webgraph::NodeID; use super::inbound_similarity; diff --git a/core/src/ranking/signal.rs b/crates/core/src/ranking/signal.rs similarity index 98% rename from core/src/ranking/signal.rs rename to crates/core/src/ranking/signal.rs index 047e5b447..f51df9352 100644 --- a/core/src/ranking/signal.rs +++ b/crates/core/src/ranking/signal.rs @@ -14,39 +14,34 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . 
-use crate::fastfield_reader::FieldValue; -use crate::query::optic::AsSearchableRule; -use crate::query::Query; -use crate::{combine_u64s, Result}; -use crate::{ - enum_map::EnumMap, - fastfield_reader, - schema::{FastField, TextField}, - webgraph::NodeID, - webpage::Webpage, -}; +use crate::{query::Query, Result}; use optics::ast::RankingTarget; use optics::Optic; +use schema::{ + fastfield_reader::{self, FieldValue}, + FastField, TextField, FLOAT_SCALING, +}; use serde::{Deserialize, Serialize}; use std::cell::RefCell; use std::str::FromStr; use std::sync::Arc; +use stdx::enum_map::EnumMap; +use stract_query::{bm25::Bm25Weight, optic::AsSearchableRule}; use tantivy::fieldnorm::FieldNormReader; use tantivy::postings::SegmentPostings; use tantivy::query::{Query as _, Scorer}; use tantivy::tokenizer::Tokenizer; use thiserror::Error; use utoipa::ToSchema; +use webgraph::NodeID; +use webpage::{ + region::{Region, RegionCount}, + Webpage, +}; use tantivy::DocSet; use tantivy::{DocId, Postings}; -use crate::{ - schema::FLOAT_SCALING, - webpage::region::{Region, RegionCount}, -}; - -use super::bm25::Bm25Weight; use super::models::linear::LinearRegression; use super::{inbound_similarity, query_centrality}; @@ -294,7 +289,7 @@ impl Signal { if node_id1 == u64::MAX && node_id2 == u64::MAX { None } else { - let id = combine_u64s([node_id1, node_id2]); + let id = stdx::combine_u64s([node_id1, node_id2]); Some(NodeID::from(id)) } diff --git a/core/src/search_ctx.rs b/crates/core/src/search_ctx.rs similarity index 94% rename from core/src/search_ctx.rs rename to crates/core/src/search_ctx.rs index 2bb3af570..12e289e4c 100644 --- a/core/src/search_ctx.rs +++ b/crates/core/src/search_ctx.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . 
-use crate::fastfield_reader::FastFieldReader; +use schema::fastfield_reader::FastFieldReader; #[derive(Clone)] pub struct Ctx { diff --git a/core/src/search_prettifier/entity.rs b/crates/core/src/search_prettifier/entity.rs similarity index 75% rename from core/src/search_prettifier/entity.rs rename to crates/core/src/search_prettifier/entity.rs index 1d0e310cb..7bec4ad53 100644 --- a/core/src/search_prettifier/entity.rs +++ b/crates/core/src/search_prettifier/entity.rs @@ -19,7 +19,7 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; use utoipa::ToSchema; -use crate::entity_index::{ +use entity_index::{ entity::{EntitySnippet, Span}, EntityMatch, }; @@ -100,48 +100,8 @@ fn maybe_prettify_entity_date(value: &str) -> Option { #[cfg(test)] mod tests { - use crate::entity_index::entity::Link; - use super::*; - #[test] - fn simple_link_to_html() { - assert_eq!( - EntitySnippet::from_span( - &Span { - text: "some text with a link".to_string(), - links: vec![Link { - start: 5, - end: 9, - target: "text article".to_string() - }] - }, - 10000 - ) - .to_md(None), - "some [text](https://en.wikipedia.org/wiki/text_article) with a link".to_string() - ); - } - - #[test] - fn truncated_link_to_html() { - assert_eq!( - EntitySnippet::from_span( - &Span { - text: "some text".to_string(), - links: vec![Link { - start: 5, - end: 9, - target: "text article".to_string() - }] - }, - 7 - ) - .to_md(None), - "some [te](https://en.wikipedia.org/wiki/text_article)...".to_string() - ); - } - #[test] fn einstein_date() { assert_eq!( diff --git a/core/src/search_prettifier/mod.rs b/crates/core/src/search_prettifier/mod.rs similarity index 99% rename from core/src/search_prettifier/mod.rs rename to crates/core/src/search_prettifier/mod.rs index d238da155..bb4c76433 100644 --- a/core/src/search_prettifier/mod.rs +++ b/crates/core/src/search_prettifier/mod.rs @@ -21,15 +21,15 @@ use std::collections::HashMap; use chrono::{NaiveDateTime, Utc}; use serde::{Deserialize, Serialize}; +use spell::CorrectionTerm; use url::Url; use utoipa::ToSchema; +use webpage::url_ext::UrlExt; use crate::{ inverted_index::RetrievedWebpage, ranking::{Signal, SignalScore}, snippet::TextSnippet, - spell::{self, CorrectionTerm}, - webpage::url_ext::UrlExt, }; pub use self::stack_overflow::{create_stackoverflow_sidebar, CodeOrText}; diff --git a/core/src/search_prettifier/stack_overflow.rs b/crates/core/src/search_prettifier/stack_overflow.rs similarity index 98% rename from core/src/search_prettifier/stack_overflow.rs rename to crates/core/src/search_prettifier/stack_overflow.rs index da9e59f0b..9caa18524 100644 --- a/core/src/search_prettifier/stack_overflow.rs +++ b/crates/core/src/search_prettifier/stack_overflow.rs @@ -17,12 +17,9 @@ use serde::{Deserialize, Serialize}; use url::Url; use utoipa::ToSchema; +use webpage::schema_org::{self, Item, OneOrMany, Property}; -use crate::{ - inverted_index::RetrievedWebpage, - webpage::schema_org::{self, Item, OneOrMany, Property}, - Error, -}; +use crate::{inverted_index::RetrievedWebpage, Error}; use super::{DisplayedSidebar, Snippet}; use crate::Result; diff --git a/core/src/searcher/api.rs b/crates/core/src/searcher/api.rs similarity index 94% rename from core/src/searcher/api.rs rename to crates/core/src/searcher/api.rs index 948d7b508..9351b4440 100644 --- a/core/src/searcher/api.rs +++ b/crates/core/src/searcher/api.rs @@ -14,20 +14,16 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . 
+use collector::BucketCollector; use std::cmp::Ordering; use std::collections::HashMap; use std::ops::Range; use std::sync::Arc; use std::time::Instant; -use itertools::{intersperse, Itertools}; -use optics::Optic; -use url::Url; - -use crate::bangs::{Bang, BangHit}; -use crate::config::{ApiConfig, ApiThresholds, CollectorConfig}; -use crate::image_store::Image; use crate::inverted_index::RetrievedWebpage; +#[cfg(feature = "libtorch")] +use crate::ranking::models::cross_encoder::CrossEncoderModel; #[cfg(not(feature = "libtorch"))] use crate::ranking::models::cross_encoder::DummyCrossEncoder; use crate::ranking::ALL_SIGNALS; @@ -37,14 +33,18 @@ use crate::search_prettifier::{ }; use crate::widgets::{Widget, Widgets}; use crate::{ - bangs::Bangs, - collector::BucketCollector, - distributed::cluster::Cluster, ranking::{models::lambdamart::LambdaMART, pipeline::RankingPipeline}, + Result, }; -use crate::{ceil_char_boundary, floor_char_boundary, query, Result}; +use ::distributed::cluster::Cluster; +use imager::image_store::Image; +use itertools::{intersperse, Itertools}; +use optics::Optic; +use stract_config::{ApiConfig, ApiThresholds, CollectorConfig}; #[cfg(feature = "libtorch")] -use crate::{qa_model::QaModel, ranking::models::cross_encoder::CrossEncoderModel}; +use stract_llm::qa_model::QaModel; +use stract_query::bangs::{Bang, BangHit, Bangs}; +use url::Url; use super::{ distributed, DistributedSearcher, InitialSearchResultShard, ScoredWebsitePointer, SearchQuery, @@ -215,16 +215,18 @@ impl ApiSearcher { } async fn check_bangs(&self, query: &SearchQuery) -> Result> { - let parsed_terms = query::parser::parse(&query.query); + let parsed_terms = stract_query::parser::parse(&query.query); if parsed_terms.iter().any(|term| match term.as_ref() { - query::parser::Term::PossibleBang(t) => t.is_empty(), + stract_query::parser::Term::PossibleBang(t) => t.is_empty(), _ => false, }) { let q: String = intersperse( parsed_terms .iter() - .filter(|term| !matches!(term.as_ref(), query::parser::Term::PossibleBang(_))) + .filter(|term| { + !matches!(term.as_ref(), stract_query::parser::Term::PossibleBang(_)) + }) .map(|term| term.to_string()), " ".to_string(), ) @@ -281,7 +283,7 @@ impl ApiSearcher { #[cfg(not(feature = "libtorch"))] let pipeline: RankingPipeline = - RankingPipeline::reranking_for_query::( + RankingPipeline::reranking_for_stract_query::( &mut query, None, self.lambda_model.clone(), @@ -371,7 +373,7 @@ impl ApiSearcher { #[cfg(not(feature = "libtorch"))] let pipeline: RankingPipeline = - RankingPipeline::reranking_for_query::( + RankingPipeline::reranking_for_stract_query::( &mut search_query, None, self.lambda_model.clone(), @@ -452,13 +454,13 @@ impl ApiSearcher { return None; } - let parsed_terms = query::parser::parse(&query.query); + let parsed_terms = stract_query::parser::parse(&query.query); self.widgets.widget( parsed_terms .into_iter() .filter_map(|term| { - if let query::parser::Term::Simple(simple) = *term { + if let stract_query::parser::Term::Simple(simple) = *term { Some(String::from(simple)) } else { None @@ -542,24 +544,24 @@ fn generate_answer_snippet(body: &str, answer_offset: Range) -> String { if (answer_offset.end - best_start > SNIPPET_LENGTH) || (best_start >= best_end) { if answer_offset.end - answer_offset.start >= SNIPPET_LENGTH { - let end = floor_char_boundary(body, answer_offset.start + SNIPPET_LENGTH); + let end = stdx::floor_char_boundary(body, answer_offset.start + SNIPPET_LENGTH); return "".to_string() + &body[answer_offset.start..end] + ""; } let 
chars_either_side = (SNIPPET_LENGTH - (answer_offset.end - answer_offset.start)) / 2; - let start = ceil_char_boundary( + let start = stdx::ceil_char_boundary( body, answer_offset .start .checked_sub(chars_either_side) .unwrap_or_default(), ); - let mut end = ceil_char_boundary(body, answer_offset.end + chars_either_side); + let mut end = stdx::ceil_char_boundary(body, answer_offset.end + chars_either_side); if end >= body.len() { - end = floor_char_boundary(body, body.len()); + end = stdx::floor_char_boundary(body, body.len()); } body[start..answer_offset.start].to_string() @@ -574,7 +576,8 @@ fn generate_answer_snippet(body: &str, answer_offset: Range) -> String { + ""; let remaining_chars = SNIPPET_LENGTH - (res.len() - 7); - let end = ceil_char_boundary(body, (remaining_chars + answer_offset.end).min(best_end)); + let end = + stdx::ceil_char_boundary(body, (remaining_chars + answer_offset.end).min(best_end)); res += &body[answer_offset.end..end]; diff --git a/core/src/searcher/discussions.optic b/crates/core/src/searcher/discussions.optic similarity index 100% rename from core/src/searcher/discussions.optic rename to crates/core/src/searcher/discussions.optic diff --git a/core/src/searcher/distributed.rs b/crates/core/src/searcher/distributed.rs similarity index 94% rename from core/src/searcher/distributed.rs rename to crates/core/src/searcher/distributed.rs index 6199f07aa..03bcf1143 100644 --- a/core/src/searcher/distributed.rs +++ b/crates/core/src/searcher/distributed.rs @@ -15,25 +15,26 @@ // along with this program. If not, see . use crate::{ - distributed::{cluster::Cluster, member::Service, retry_strategy::ExponentialBackoff}, entrypoint::search_server::{self, SearchService}, - image_store::Image, - inverted_index::{self, RetrievedWebpage}, + inverted_index::RetrievedWebpage, ranking::pipeline::{AsRankingWebsite, RankingWebsite}, Result, }; use std::{collections::HashMap, net::SocketAddr, sync::Arc, time::Duration}; +use distributed::{ + cluster::Cluster, + member::{Service, ShardId}, + retry_strategy::ExponentialBackoff, +}; use futures::stream::FuturesUnordered; use futures::StreamExt; +use imager::image_store::Image; use rand::seq::SliceRandom; -use serde::{Deserialize, Serialize}; use thiserror::Error; use url::Url; -use crate::distributed::sonic; - use super::{InitialWebsiteResult, SearchQuery}; const NUM_REPLICA_RETRIES: usize = 3; @@ -75,7 +76,7 @@ impl RemoteSearcher { async fn retrieve_websites( &self, - pointers: &[inverted_index::WebsitePointer], + pointers: &[collector::WebsitePointer], original_query: &str, ) -> Result> { let conn = self.conn().await; @@ -159,9 +160,6 @@ impl RemoteSearcher { } } -#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)] -pub struct ShardId(u64); - pub struct Shard { id: ShardId, replicas: Vec, @@ -178,7 +176,7 @@ impl Shard { } Self { - id: ShardId(id), + id: ShardId::new(id), replicas: parsed_replicas, } } @@ -202,7 +200,7 @@ impl Shard { async fn retrieve_websites( &self, - pointers: &[inverted_index::WebsitePointer], + pointers: &[collector::WebsitePointer], original_query: &str, ) -> Result> { for _ in 0..NUM_REPLICA_RETRIES { @@ -249,6 +247,20 @@ pub struct ScoredWebsitePointer { pub shard: ShardId, } +impl collector::Doc for ScoredWebsitePointer { + fn score(&self) -> f64 { + self.as_ranking().score() + } + + fn id(&self) -> &tantivy::DocId { + self.as_ranking().id() + } + + fn hashes(&self) -> collector::Hashes { + self.as_ranking().hashes() + } +} + impl AsRankingWebsite for ScoredWebsitePointer { fn 
as_ranking(&self) -> &RankingWebsite { &self.website diff --git a/core/src/searcher/local.rs b/crates/core/src/searcher/local.rs similarity index 94% rename from core/src/searcher/local.rs rename to crates/core/src/searcher/local.rs index 481d603fb..b3288e552 100644 --- a/core/src/searcher/local.rs +++ b/crates/core/src/searcher/local.rs @@ -17,13 +17,17 @@ use std::collections::HashMap; use std::sync::Arc; +use entity_index::{EntityIndex, EntityMatch}; +use imager::image_store::Image; +use schema::TextField; +use spell::Spell; +use stract_config::{CollectorConfig, SnippetConfig}; use tantivy::schema::Value; use tantivy::TantivyDocument; use url::Url; +use webgraph::Node; +use webpage::region::Region; -use crate::config::{CollectorConfig, SnippetConfig}; -use crate::entity_index::{EntityIndex, EntityMatch}; -use crate::image_store::Image; use crate::index::Index; use crate::inverted_index::RetrievedWebpage; use crate::query::Query; @@ -34,12 +38,8 @@ use crate::ranking::models::lambdamart::LambdaMART; use crate::ranking::models::linear::LinearRegression; use crate::ranking::pipeline::{RankingPipeline, RankingWebsite}; use crate::ranking::{query_centrality, Ranker, Signal, SignalAggregator, ALL_SIGNALS}; -use crate::schema::TextField; use crate::search_ctx::Ctx; use crate::search_prettifier::{DisplayedEntity, DisplayedWebpage, HighlightedSpellCorrection}; -use crate::spell::Spell; -use crate::webgraph::Node; -use crate::webpage::region::Region; use crate::{inverted_index, Error, Result}; use super::WebsitesResult; @@ -84,7 +84,7 @@ impl LocalSearcher { } pub fn build_spell_dict(&mut self) { - self.spell = Some(Spell::for_index(&self.index)); + self.spell = Some(Spell::for_searcher(self.index.inverted_index.tv_searcher())); } pub fn set_entity_index(&mut self, entity_index: EntityIndex) { @@ -278,7 +278,7 @@ impl LocalSearcher { ) -> Result { let ctx = self.index.inverted_index.local_search_ctx(); let inverted_index_result = self.search_inverted_index(&ctx, query, de_rank_similar)?; - let correction = self.spell.as_ref().and_then(|s| s.correction(query)); + let correction = self.spell.as_ref().and_then(|s| s.correction(&query.query)); let sidebar = self.entity_sidebar(query); Ok(InitialWebsiteResult { @@ -292,7 +292,7 @@ impl LocalSearcher { pub fn retrieve_websites( &self, - websites: &[inverted_index::WebsitePointer], + websites: &[collector::WebsitePointer], query: &str, ) -> Result> { let ctx = self.index.inverted_index.local_search_ctx(); @@ -321,7 +321,7 @@ impl LocalSearcher { #[cfg(feature = "libtorch")] let pipeline = { use crate::ranking::models::cross_encoder::CrossEncoderModel; - match CrossEncoderModel::open("data/cross_encoder") { + match CrossEncoderModel::open("data/cross_encoder".as_ref()) { Ok(model) => RankingPipeline::reranking_for_query::( &mut search_query, Some(Arc::new(model)), @@ -415,10 +415,8 @@ impl LocalSearcher { #[cfg(test)] mod tests { - use crate::{ - searcher::NUM_RESULTS_PER_PAGE, - webpage::{Html, Webpage}, - }; + use crate::searcher::NUM_RESULTS_PER_PAGE; + use webpage::{Html, Webpage}; use super::*; @@ -518,19 +516,13 @@ mod tests { .spell .as_ref() .unwrap() - .correction(&SearchQuery { - query: "th best".to_string(), - ..Default::default() - }) + .correction("th best") .unwrap() ), "the best".to_string() ); assert_eq!( - searcher.spell.as_ref().unwrap().correction(&SearchQuery { - query: "the best".to_string(), - ..Default::default() - }), + searcher.spell.as_ref().unwrap().correction("the best"), None ); } diff --git a/core/src/searcher/mod.rs 
b/crates/core/src/searcher/mod.rs similarity index 95% rename from core/src/searcher/mod.rs rename to crates/core/src/searcher/mod.rs index 592d25521..b6b2c73e2 100644 --- a/core/src/searcher/mod.rs +++ b/crates/core/src/searcher/mod.rs @@ -19,21 +19,21 @@ pub mod distributed; pub mod local; pub use distributed::*; +use entity_index::EntityMatch; pub use local::*; use optics::{Optic, SiteRankings}; use serde::{Deserialize, Serialize}; +use spell::Correction; +use stract_config::defaults; +use stract_query::bangs::BangHit; use utoipa::ToSchema; +use webpage::region::Region; use crate::{ - bangs::BangHit, - config::defaults, - entity_index::EntityMatch, ranking::pipeline::RankingWebsite, search_prettifier::{ DisplayedAnswer, DisplayedSidebar, DisplayedWebpage, HighlightedSpellCorrection, }, - spell::Correction, - webpage::region::Region, widgets::Widget, }; diff --git a/core/src/searcher/stackoverflow.optic b/crates/core/src/searcher/stackoverflow.optic similarity index 100% rename from core/src/searcher/stackoverflow.optic rename to crates/core/src/searcher/stackoverflow.optic diff --git a/core/src/similar_sites.rs b/crates/core/src/similar_sites.rs similarity index 97% rename from core/src/similar_sites.rs rename to crates/core/src/similar_sites.rs index 7b4568a46..709335978 100644 --- a/core/src/similar_sites.rs +++ b/crates/core/src/similar_sites.rs @@ -15,16 +15,14 @@ // along with this program. If not, see . use std::{cmp::Reverse, collections::BinaryHeap, sync::Arc}; +use stdx::intmap::{IntMap, IntSet}; +use webgraph::{Node, NodeID, Webgraph}; +use webpage::url_ext::UrlExt; use hashbrown::HashSet; use url::Url; -use crate::{ - intmap::{IntMap, IntSet}, - ranking::inbound_similarity::InboundSimilarity, - webgraph::{Node, NodeID, Webgraph}, - webpage::url_ext::UrlExt, -}; +use crate::ranking::inbound_similarity::InboundSimilarity; #[derive(serde::Serialize, serde::Deserialize, Debug)] pub struct ScoredNode { diff --git a/core/src/snippet.rs b/crates/core/src/snippet.rs similarity index 94% rename from core/src/snippet.rs rename to crates/core/src/snippet.rs index 6e80a0a3b..371e2f966 100644 --- a/core/src/snippet.rs +++ b/crates/core/src/snippet.rs @@ -14,13 +14,12 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . 
+use spell::sentence_ranges; use std::ops::Range; +use stract_config::SnippetConfig; +use tokenizer::{BigramTokenizer, Normal, Stemmed, Tokenizer, TrigramTokenizer}; +use webpage::region::Region; -use crate::config::SnippetConfig; -use crate::query::Query; -use crate::spell::sentence_ranges; -use crate::tokenizer::{BigramTokenizer, Normal, Stemmed, Tokenizer, TrigramTokenizer}; -use crate::webpage::region::Region; use hashbrown::{HashMap, HashSet}; use utoipa::ToSchema; @@ -133,15 +132,15 @@ impl SnippetBuilder { } } -fn snippet_string_builder( +fn snippet_string_builder<'a>( text: &str, - terms: &[String], + terms: impl IntoIterator, lang: whatlang::Lang, config: SnippetConfig, mut tokenizer: Tokenizer, ) -> SnippetBuilder { let terms: HashSet = terms - .iter() + .into_iter() .flat_map(|term| { let mut stream = tantivy::tokenizer::Tokenizer::token_stream(&mut tokenizer, term); @@ -271,14 +270,14 @@ fn snippet_string_builder( snippet } -fn snippet_string( +fn snippet_string<'a>( text: &str, - terms: &[String], + terms: impl IntoIterator + Clone, lang: whatlang::Lang, config: SnippetConfig, ) -> TextSnippet { let tokenizer = Tokenizer::Normal(Normal::default()); - let snip = snippet_string_builder(text, terms, lang, config.clone(), tokenizer).build(); + let snip = snippet_string_builder(text, terms.clone(), lang, config.clone(), tokenizer).build(); if !snip.fragments.is_empty() && snip @@ -293,7 +292,12 @@ fn snippet_string( snippet_string_builder(text, terms, lang, config, tokenizer).build() } -pub fn generate(query: &Query, text: &str, region: &Region, config: SnippetConfig) -> TextSnippet { +pub fn generate<'a>( + terms: impl IntoIterator + Clone, + text: &str, + region: &Region, + config: SnippetConfig, +) -> TextSnippet { let lang = match region.lang() { Some(lang) => lang, None => match config.num_words_for_lang_detection { @@ -320,9 +324,9 @@ pub fn generate(query: &Query, text: &str, region: &Region, config: SnippetConfi match config.max_considered_words { Some(num_words) => { let text = text.split_whitespace().take(num_words).join(" "); - snippet_string(&text, query.simple_terms(), lang, config) + snippet_string(&text, terms, lang, config) } - None => snippet_string(text, query.simple_terms(), lang, config), + None => snippet_string(text, terms, lang, config), } } @@ -333,8 +337,8 @@ mod tests { index::Index, search_prettifier::Snippet, searcher::{LocalSearcher, SearchQuery}, - webpage::Webpage, }; + use webpage::Webpage; const TEST_TEXT: &str = r#"Rust is a systems programming language sponsored by Mozilla which describes it as a "safe, concurrent, practical language", supporting functional and @@ -499,7 +503,7 @@ Survey in 2016, 2017, and 2018."#; date: None, text: snippet_string( "this is a test", - &[], + [], whatlang::Lang::Eng, SnippetConfig::default() ) @@ -514,12 +518,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!( highlight(Snippet::Normal { date: None, - text: snippet_string( - "", - &["test".to_string()], - whatlang::Lang::Eng, - SnippetConfig::default() - ) + text: snippet_string("", ["test"], whatlang::Lang::Eng, SnippetConfig::default()) }) .as_str(), "" @@ -528,7 +527,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!( highlight(Snippet::Normal { date: None, - text: snippet_string("", &[], whatlang::Lang::Eng, SnippetConfig::default()) + text: snippet_string("", [], whatlang::Lang::Eng, SnippetConfig::default()) }) .as_str(), "" @@ -539,7 +538,7 @@ Survey in 2016, 2017, and 2018."#; fn compounded_terms() { let snip = snippet_string_builder( "this is a test", - 
&["thisis".to_string()], + ["thisis"], whatlang::Lang::Eng, SnippetConfig::default(), Tokenizer::Normal(Normal::default()), diff --git a/core/src/subdomain_count.rs b/crates/core/src/subdomain_count.rs similarity index 90% rename from core/src/subdomain_count.rs rename to crates/core/src/subdomain_count.rs index 5ab10e53c..751dab6e8 100644 --- a/core/src/subdomain_count.rs +++ b/crates/core/src/subdomain_count.rs @@ -14,20 +14,18 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use crate::{ - kv::{rocksdb_store::RocksDbStore, Kv}, - prehashed::{hash, Prehashed}, - webpage::url_ext::UrlExt, -}; +use kv::{rocksdb_store::RocksDbStore, Kv}; use std::{collections::HashSet, path::Path}; +use stdx::prehashed::{hash, Prehashed}; use url::Url; +use webpage::url_ext::UrlExt; pub struct SubdomainCounter { inner: Box>>, } impl SubdomainCounter { - pub fn open>(path: P) -> Self { + pub fn open(path: &Path) -> Self { Self { inner: Box::new(RocksDbStore::open(path)), } diff --git a/core/src/ttl_cache.rs b/crates/core/src/ttl_cache.rs similarity index 100% rename from core/src/ttl_cache.rs rename to crates/core/src/ttl_cache.rs diff --git a/core/src/widgets/calculator.rs b/crates/core/src/widgets/calculator.rs similarity index 100% rename from core/src/widgets/calculator.rs rename to crates/core/src/widgets/calculator.rs diff --git a/core/src/widgets/mod.rs b/crates/core/src/widgets/mod.rs similarity index 98% rename from core/src/widgets/mod.rs rename to crates/core/src/widgets/mod.rs index 2fcea1f09..3213a5387 100644 --- a/core/src/widgets/mod.rs +++ b/crates/core/src/widgets/mod.rs @@ -15,11 +15,11 @@ // along with this program. If not, see . use serde::{Deserialize, Serialize}; +use stract_config::WidgetsConfig; use thiserror::Error; use utoipa::ToSchema; use self::thesaurus::ThesaurusWidget; -use crate::config::WidgetsConfig; use self::calculator::{Calculation, Calculator}; use anyhow::{anyhow, Result}; diff --git a/core/src/widgets/thesaurus.rs b/crates/core/src/widgets/thesaurus.rs similarity index 98% rename from core/src/widgets/thesaurus.rs rename to crates/core/src/widgets/thesaurus.rs index 791d27f49..f037a1a56 100644 --- a/core/src/widgets/thesaurus.rs +++ b/crates/core/src/widgets/thesaurus.rs @@ -276,7 +276,7 @@ impl Dictionary { self.spellings.insert(normalized, lemma); } - pub fn build>(path: P) -> Result { + pub fn build(path: &Path) -> Result { let reader = BufReader::new(File::open(path)?); let mut parser = TurtleParser::new(reader, None); @@ -473,7 +473,8 @@ mod tests { #[test] fn build_dict() { - let dict = Dictionary::build("../data/english-wordnet-2022-subset.ttl").unwrap(); + let dict = + Dictionary::build("../../data/english-wordnet-2022-subset.ttl".as_ref()).unwrap(); let infos = dict.get(Lemma("barely".to_string())); diff --git a/core/stopwords/Afrikaans.txt b/crates/core/stopwords/Afrikaans.txt similarity index 100% rename from core/stopwords/Afrikaans.txt rename to crates/core/stopwords/Afrikaans.txt diff --git a/core/stopwords/Albanian.txt b/crates/core/stopwords/Albanian.txt similarity index 100% rename from core/stopwords/Albanian.txt rename to crates/core/stopwords/Albanian.txt diff --git a/core/stopwords/Arabic.txt b/crates/core/stopwords/Arabic.txt similarity index 100% rename from core/stopwords/Arabic.txt rename to crates/core/stopwords/Arabic.txt diff --git a/core/stopwords/Aragonese.txt b/crates/core/stopwords/Aragonese.txt similarity index 100% rename from core/stopwords/Aragonese.txt rename 
to crates/core/stopwords/Aragonese.txt diff --git a/core/stopwords/Armenian.txt b/crates/core/stopwords/Armenian.txt similarity index 100% rename from core/stopwords/Armenian.txt rename to crates/core/stopwords/Armenian.txt diff --git a/core/stopwords/Aromanian.txt b/crates/core/stopwords/Aromanian.txt similarity index 100% rename from core/stopwords/Aromanian.txt rename to crates/core/stopwords/Aromanian.txt diff --git a/core/stopwords/Asturian.txt b/crates/core/stopwords/Asturian.txt similarity index 100% rename from core/stopwords/Asturian.txt rename to crates/core/stopwords/Asturian.txt diff --git a/core/stopwords/Azerbaijani.txt b/crates/core/stopwords/Azerbaijani.txt similarity index 100% rename from core/stopwords/Azerbaijani.txt rename to crates/core/stopwords/Azerbaijani.txt diff --git a/core/stopwords/Basque.txt b/crates/core/stopwords/Basque.txt similarity index 100% rename from core/stopwords/Basque.txt rename to crates/core/stopwords/Basque.txt diff --git a/core/stopwords/Belarusian.txt b/crates/core/stopwords/Belarusian.txt similarity index 100% rename from core/stopwords/Belarusian.txt rename to crates/core/stopwords/Belarusian.txt diff --git a/core/stopwords/Belarusian_Taraskievica.txt b/crates/core/stopwords/Belarusian_Taraskievica.txt similarity index 100% rename from core/stopwords/Belarusian_Taraskievica.txt rename to crates/core/stopwords/Belarusian_Taraskievica.txt diff --git a/core/stopwords/Bengali.txt b/crates/core/stopwords/Bengali.txt similarity index 100% rename from core/stopwords/Bengali.txt rename to crates/core/stopwords/Bengali.txt diff --git a/core/stopwords/Bishnupriya_Manipuri.txt b/crates/core/stopwords/Bishnupriya_Manipuri.txt similarity index 100% rename from core/stopwords/Bishnupriya_Manipuri.txt rename to crates/core/stopwords/Bishnupriya_Manipuri.txt diff --git a/core/stopwords/Bosnian.txt b/crates/core/stopwords/Bosnian.txt similarity index 100% rename from core/stopwords/Bosnian.txt rename to crates/core/stopwords/Bosnian.txt diff --git a/core/stopwords/Breton.txt b/crates/core/stopwords/Breton.txt similarity index 100% rename from core/stopwords/Breton.txt rename to crates/core/stopwords/Breton.txt diff --git a/core/stopwords/Bulgarian.txt b/crates/core/stopwords/Bulgarian.txt similarity index 100% rename from core/stopwords/Bulgarian.txt rename to crates/core/stopwords/Bulgarian.txt diff --git a/core/stopwords/Catalan.txt b/crates/core/stopwords/Catalan.txt similarity index 100% rename from core/stopwords/Catalan.txt rename to crates/core/stopwords/Catalan.txt diff --git a/core/stopwords/Cebuano.txt b/crates/core/stopwords/Cebuano.txt similarity index 100% rename from core/stopwords/Cebuano.txt rename to crates/core/stopwords/Cebuano.txt diff --git a/core/stopwords/Chuvash.txt b/crates/core/stopwords/Chuvash.txt similarity index 100% rename from core/stopwords/Chuvash.txt rename to crates/core/stopwords/Chuvash.txt diff --git a/core/stopwords/Croatian.txt b/crates/core/stopwords/Croatian.txt similarity index 100% rename from core/stopwords/Croatian.txt rename to crates/core/stopwords/Croatian.txt diff --git a/core/stopwords/Czech.txt b/crates/core/stopwords/Czech.txt similarity index 100% rename from core/stopwords/Czech.txt rename to crates/core/stopwords/Czech.txt diff --git a/core/stopwords/Danish.txt b/crates/core/stopwords/Danish.txt similarity index 100% rename from core/stopwords/Danish.txt rename to crates/core/stopwords/Danish.txt diff --git a/core/stopwords/Dutch.txt b/crates/core/stopwords/Dutch.txt similarity index 100% rename from 
core/stopwords/Dutch.txt rename to crates/core/stopwords/Dutch.txt diff --git a/core/stopwords/English.txt b/crates/core/stopwords/English.txt similarity index 100% rename from core/stopwords/English.txt rename to crates/core/stopwords/English.txt diff --git a/core/stopwords/Esperanto.txt b/crates/core/stopwords/Esperanto.txt similarity index 100% rename from core/stopwords/Esperanto.txt rename to crates/core/stopwords/Esperanto.txt diff --git a/core/stopwords/Estonian.txt b/crates/core/stopwords/Estonian.txt similarity index 100% rename from core/stopwords/Estonian.txt rename to crates/core/stopwords/Estonian.txt diff --git a/core/stopwords/Finnish.txt b/crates/core/stopwords/Finnish.txt similarity index 100% rename from core/stopwords/Finnish.txt rename to crates/core/stopwords/Finnish.txt diff --git a/core/stopwords/French.txt b/crates/core/stopwords/French.txt similarity index 100% rename from core/stopwords/French.txt rename to crates/core/stopwords/French.txt diff --git a/core/stopwords/Galician.txt b/crates/core/stopwords/Galician.txt similarity index 100% rename from core/stopwords/Galician.txt rename to crates/core/stopwords/Galician.txt diff --git a/core/stopwords/Georgian.txt b/crates/core/stopwords/Georgian.txt similarity index 100% rename from core/stopwords/Georgian.txt rename to crates/core/stopwords/Georgian.txt diff --git a/core/stopwords/German.txt b/crates/core/stopwords/German.txt similarity index 100% rename from core/stopwords/German.txt rename to crates/core/stopwords/German.txt diff --git a/core/stopwords/Greek.txt b/crates/core/stopwords/Greek.txt similarity index 100% rename from core/stopwords/Greek.txt rename to crates/core/stopwords/Greek.txt diff --git a/core/stopwords/Gujarati.txt b/crates/core/stopwords/Gujarati.txt similarity index 100% rename from core/stopwords/Gujarati.txt rename to crates/core/stopwords/Gujarati.txt diff --git a/core/stopwords/Haitian.txt b/crates/core/stopwords/Haitian.txt similarity index 100% rename from core/stopwords/Haitian.txt rename to crates/core/stopwords/Haitian.txt diff --git a/core/stopwords/Hebrew.txt b/crates/core/stopwords/Hebrew.txt similarity index 100% rename from core/stopwords/Hebrew.txt rename to crates/core/stopwords/Hebrew.txt diff --git a/core/stopwords/Hindi.txt b/crates/core/stopwords/Hindi.txt similarity index 100% rename from core/stopwords/Hindi.txt rename to crates/core/stopwords/Hindi.txt diff --git a/core/stopwords/Hungarian.txt b/crates/core/stopwords/Hungarian.txt similarity index 100% rename from core/stopwords/Hungarian.txt rename to crates/core/stopwords/Hungarian.txt diff --git a/core/stopwords/Icelandic.txt b/crates/core/stopwords/Icelandic.txt similarity index 100% rename from core/stopwords/Icelandic.txt rename to crates/core/stopwords/Icelandic.txt diff --git a/core/stopwords/Ido.txt b/crates/core/stopwords/Ido.txt similarity index 100% rename from core/stopwords/Ido.txt rename to crates/core/stopwords/Ido.txt diff --git a/core/stopwords/Igbo.txt b/crates/core/stopwords/Igbo.txt similarity index 100% rename from core/stopwords/Igbo.txt rename to crates/core/stopwords/Igbo.txt diff --git a/core/stopwords/Indonesian.txt b/crates/core/stopwords/Indonesian.txt similarity index 100% rename from core/stopwords/Indonesian.txt rename to crates/core/stopwords/Indonesian.txt diff --git a/core/stopwords/Irish.txt b/crates/core/stopwords/Irish.txt similarity index 100% rename from core/stopwords/Irish.txt rename to crates/core/stopwords/Irish.txt diff --git a/core/stopwords/Italian.txt 
b/crates/core/stopwords/Italian.txt similarity index 100% rename from core/stopwords/Italian.txt rename to crates/core/stopwords/Italian.txt diff --git a/core/stopwords/Japanese.txt b/crates/core/stopwords/Japanese.txt similarity index 100% rename from core/stopwords/Japanese.txt rename to crates/core/stopwords/Japanese.txt diff --git a/core/stopwords/Javanese.txt b/crates/core/stopwords/Javanese.txt similarity index 100% rename from core/stopwords/Javanese.txt rename to crates/core/stopwords/Javanese.txt diff --git a/core/stopwords/Kannada.txt b/crates/core/stopwords/Kannada.txt similarity index 100% rename from core/stopwords/Kannada.txt rename to crates/core/stopwords/Kannada.txt diff --git a/core/stopwords/Kazakh.txt b/crates/core/stopwords/Kazakh.txt similarity index 100% rename from core/stopwords/Kazakh.txt rename to crates/core/stopwords/Kazakh.txt diff --git a/core/stopwords/Korean.txt b/crates/core/stopwords/Korean.txt similarity index 100% rename from core/stopwords/Korean.txt rename to crates/core/stopwords/Korean.txt diff --git a/core/stopwords/Kurdish.txt b/crates/core/stopwords/Kurdish.txt similarity index 100% rename from core/stopwords/Kurdish.txt rename to crates/core/stopwords/Kurdish.txt diff --git a/core/stopwords/Kyrgyz.txt b/crates/core/stopwords/Kyrgyz.txt similarity index 100% rename from core/stopwords/Kyrgyz.txt rename to crates/core/stopwords/Kyrgyz.txt diff --git a/core/stopwords/Latin.txt b/crates/core/stopwords/Latin.txt similarity index 100% rename from core/stopwords/Latin.txt rename to crates/core/stopwords/Latin.txt diff --git a/core/stopwords/Latvian.txt b/crates/core/stopwords/Latvian.txt similarity index 100% rename from core/stopwords/Latvian.txt rename to crates/core/stopwords/Latvian.txt diff --git a/core/stopwords/Lithuanian.txt b/crates/core/stopwords/Lithuanian.txt similarity index 100% rename from core/stopwords/Lithuanian.txt rename to crates/core/stopwords/Lithuanian.txt diff --git a/core/stopwords/Lombard.txt b/crates/core/stopwords/Lombard.txt similarity index 100% rename from core/stopwords/Lombard.txt rename to crates/core/stopwords/Lombard.txt diff --git a/core/stopwords/Low_Saxon.txt b/crates/core/stopwords/Low_Saxon.txt similarity index 100% rename from core/stopwords/Low_Saxon.txt rename to crates/core/stopwords/Low_Saxon.txt diff --git a/core/stopwords/Luxembourgish.txt b/crates/core/stopwords/Luxembourgish.txt similarity index 100% rename from core/stopwords/Luxembourgish.txt rename to crates/core/stopwords/Luxembourgish.txt diff --git a/core/stopwords/Macedonian.txt b/crates/core/stopwords/Macedonian.txt similarity index 100% rename from core/stopwords/Macedonian.txt rename to crates/core/stopwords/Macedonian.txt diff --git a/core/stopwords/Malay.txt b/crates/core/stopwords/Malay.txt similarity index 100% rename from core/stopwords/Malay.txt rename to crates/core/stopwords/Malay.txt diff --git a/core/stopwords/Malayalam.txt b/crates/core/stopwords/Malayalam.txt similarity index 100% rename from core/stopwords/Malayalam.txt rename to crates/core/stopwords/Malayalam.txt diff --git a/core/stopwords/Maltese.txt b/crates/core/stopwords/Maltese.txt similarity index 100% rename from core/stopwords/Maltese.txt rename to crates/core/stopwords/Maltese.txt diff --git a/core/stopwords/Marathi.txt b/crates/core/stopwords/Marathi.txt similarity index 100% rename from core/stopwords/Marathi.txt rename to crates/core/stopwords/Marathi.txt diff --git a/core/stopwords/Neapolitan.txt b/crates/core/stopwords/Neapolitan.txt similarity index 100% rename 
from core/stopwords/Neapolitan.txt rename to crates/core/stopwords/Neapolitan.txt diff --git a/core/stopwords/Nepali.txt b/crates/core/stopwords/Nepali.txt similarity index 100% rename from core/stopwords/Nepali.txt rename to crates/core/stopwords/Nepali.txt diff --git a/core/stopwords/Newar.txt b/crates/core/stopwords/Newar.txt similarity index 100% rename from core/stopwords/Newar.txt rename to crates/core/stopwords/Newar.txt diff --git a/core/stopwords/Norwegian_Bokmal.txt b/crates/core/stopwords/Norwegian_Bokmal.txt similarity index 100% rename from core/stopwords/Norwegian_Bokmal.txt rename to crates/core/stopwords/Norwegian_Bokmal.txt diff --git a/core/stopwords/Norwegian_Nynorsk.txt b/crates/core/stopwords/Norwegian_Nynorsk.txt similarity index 100% rename from core/stopwords/Norwegian_Nynorsk.txt rename to crates/core/stopwords/Norwegian_Nynorsk.txt diff --git a/core/stopwords/Occitan.txt b/crates/core/stopwords/Occitan.txt similarity index 100% rename from core/stopwords/Occitan.txt rename to crates/core/stopwords/Occitan.txt diff --git a/core/stopwords/Persian.txt b/crates/core/stopwords/Persian.txt similarity index 100% rename from core/stopwords/Persian.txt rename to crates/core/stopwords/Persian.txt diff --git a/core/stopwords/Piedmontese.txt b/crates/core/stopwords/Piedmontese.txt similarity index 100% rename from core/stopwords/Piedmontese.txt rename to crates/core/stopwords/Piedmontese.txt diff --git a/core/stopwords/Polish.txt b/crates/core/stopwords/Polish.txt similarity index 100% rename from core/stopwords/Polish.txt rename to crates/core/stopwords/Polish.txt diff --git a/core/stopwords/Portuguese.txt b/crates/core/stopwords/Portuguese.txt similarity index 100% rename from core/stopwords/Portuguese.txt rename to crates/core/stopwords/Portuguese.txt diff --git a/core/stopwords/Quechua.txt b/crates/core/stopwords/Quechua.txt similarity index 100% rename from core/stopwords/Quechua.txt rename to crates/core/stopwords/Quechua.txt diff --git a/core/stopwords/Romanian.txt b/crates/core/stopwords/Romanian.txt similarity index 100% rename from core/stopwords/Romanian.txt rename to crates/core/stopwords/Romanian.txt diff --git a/core/stopwords/Russian.txt b/crates/core/stopwords/Russian.txt similarity index 100% rename from core/stopwords/Russian.txt rename to crates/core/stopwords/Russian.txt diff --git a/core/stopwords/Samogitian.txt b/crates/core/stopwords/Samogitian.txt similarity index 100% rename from core/stopwords/Samogitian.txt rename to crates/core/stopwords/Samogitian.txt diff --git a/core/stopwords/Serbian.txt b/crates/core/stopwords/Serbian.txt similarity index 100% rename from core/stopwords/Serbian.txt rename to crates/core/stopwords/Serbian.txt diff --git a/core/stopwords/Serbo_Croatian.txt b/crates/core/stopwords/Serbo_Croatian.txt similarity index 100% rename from core/stopwords/Serbo_Croatian.txt rename to crates/core/stopwords/Serbo_Croatian.txt diff --git a/core/stopwords/Sicilian.txt b/crates/core/stopwords/Sicilian.txt similarity index 100% rename from core/stopwords/Sicilian.txt rename to crates/core/stopwords/Sicilian.txt diff --git a/core/stopwords/Simple_English.txt b/crates/core/stopwords/Simple_English.txt similarity index 100% rename from core/stopwords/Simple_English.txt rename to crates/core/stopwords/Simple_English.txt diff --git a/core/stopwords/Slovak.txt b/crates/core/stopwords/Slovak.txt similarity index 100% rename from core/stopwords/Slovak.txt rename to crates/core/stopwords/Slovak.txt diff --git a/core/stopwords/Slovenian.txt 
b/crates/core/stopwords/Slovenian.txt similarity index 100% rename from core/stopwords/Slovenian.txt rename to crates/core/stopwords/Slovenian.txt diff --git a/core/stopwords/Spanish.txt b/crates/core/stopwords/Spanish.txt similarity index 100% rename from core/stopwords/Spanish.txt rename to crates/core/stopwords/Spanish.txt diff --git a/core/stopwords/Sundanese.txt b/crates/core/stopwords/Sundanese.txt similarity index 100% rename from core/stopwords/Sundanese.txt rename to crates/core/stopwords/Sundanese.txt diff --git a/core/stopwords/Swahili.txt b/crates/core/stopwords/Swahili.txt similarity index 100% rename from core/stopwords/Swahili.txt rename to crates/core/stopwords/Swahili.txt diff --git a/core/stopwords/Swedish.txt b/crates/core/stopwords/Swedish.txt similarity index 100% rename from core/stopwords/Swedish.txt rename to crates/core/stopwords/Swedish.txt diff --git a/core/stopwords/Tagalog.txt b/crates/core/stopwords/Tagalog.txt similarity index 100% rename from core/stopwords/Tagalog.txt rename to crates/core/stopwords/Tagalog.txt diff --git a/core/stopwords/Tamil.txt b/crates/core/stopwords/Tamil.txt similarity index 100% rename from core/stopwords/Tamil.txt rename to crates/core/stopwords/Tamil.txt diff --git a/core/stopwords/Telugu.txt b/crates/core/stopwords/Telugu.txt similarity index 100% rename from core/stopwords/Telugu.txt rename to crates/core/stopwords/Telugu.txt diff --git a/core/stopwords/Turkish.txt b/crates/core/stopwords/Turkish.txt similarity index 100% rename from core/stopwords/Turkish.txt rename to crates/core/stopwords/Turkish.txt diff --git a/core/stopwords/Turkmen.txt b/crates/core/stopwords/Turkmen.txt similarity index 100% rename from core/stopwords/Turkmen.txt rename to crates/core/stopwords/Turkmen.txt diff --git a/core/stopwords/Ukrainian.txt b/crates/core/stopwords/Ukrainian.txt similarity index 100% rename from core/stopwords/Ukrainian.txt rename to crates/core/stopwords/Ukrainian.txt diff --git a/core/stopwords/Urdu.txt b/crates/core/stopwords/Urdu.txt similarity index 100% rename from core/stopwords/Urdu.txt rename to crates/core/stopwords/Urdu.txt diff --git a/core/stopwords/Uzbek.txt b/crates/core/stopwords/Uzbek.txt similarity index 100% rename from core/stopwords/Uzbek.txt rename to crates/core/stopwords/Uzbek.txt diff --git a/core/stopwords/Vietnamese.txt b/crates/core/stopwords/Vietnamese.txt similarity index 100% rename from core/stopwords/Vietnamese.txt rename to crates/core/stopwords/Vietnamese.txt diff --git a/core/stopwords/Volapuk.txt b/crates/core/stopwords/Volapuk.txt similarity index 100% rename from core/stopwords/Volapuk.txt rename to crates/core/stopwords/Volapuk.txt diff --git a/core/stopwords/Walloon.txt b/crates/core/stopwords/Walloon.txt similarity index 100% rename from core/stopwords/Walloon.txt rename to crates/core/stopwords/Walloon.txt diff --git a/core/stopwords/Waray_Waray.txt b/crates/core/stopwords/Waray_Waray.txt similarity index 100% rename from core/stopwords/Waray_Waray.txt rename to crates/core/stopwords/Waray_Waray.txt diff --git a/core/stopwords/Welsh.txt b/crates/core/stopwords/Welsh.txt similarity index 100% rename from core/stopwords/Welsh.txt rename to crates/core/stopwords/Welsh.txt diff --git a/core/stopwords/West_Frisian.txt b/crates/core/stopwords/West_Frisian.txt similarity index 100% rename from core/stopwords/West_Frisian.txt rename to crates/core/stopwords/West_Frisian.txt diff --git a/core/stopwords/Western_Panjabi.txt b/crates/core/stopwords/Western_Panjabi.txt similarity index 100% rename from 
core/stopwords/Western_Panjabi.txt rename to crates/core/stopwords/Western_Panjabi.txt diff --git a/core/stopwords/Yoruba.txt b/crates/core/stopwords/Yoruba.txt similarity index 100% rename from core/stopwords/Yoruba.txt rename to crates/core/stopwords/Yoruba.txt diff --git a/core/testcases/entity/algorithm.txt b/crates/core/testcases/entity/algorithm.txt similarity index 100% rename from core/testcases/entity/algorithm.txt rename to crates/core/testcases/entity/algorithm.txt diff --git a/core/testcases/entity/andre.txt b/crates/core/testcases/entity/andre.txt similarity index 100% rename from core/testcases/entity/andre.txt rename to crates/core/testcases/entity/andre.txt diff --git a/core/testcases/entity/aristotle.txt b/crates/core/testcases/entity/aristotle.txt similarity index 100% rename from core/testcases/entity/aristotle.txt rename to crates/core/testcases/entity/aristotle.txt diff --git a/core/testcases/entity/disambiguation.txt b/crates/core/testcases/entity/disambiguation.txt similarity index 100% rename from core/testcases/entity/disambiguation.txt rename to crates/core/testcases/entity/disambiguation.txt diff --git a/core/testcases/entity/obama.txt b/crates/core/testcases/entity/obama.txt similarity index 100% rename from core/testcases/entity/obama.txt rename to crates/core/testcases/entity/obama.txt diff --git a/core/testcases/lambdamart.txt b/crates/core/testcases/lambdamart.txt similarity index 100% rename from core/testcases/lambdamart.txt rename to crates/core/testcases/lambdamart.txt diff --git a/core/testcases/parsing/5390001.html b/crates/core/testcases/parsing/5390001.html similarity index 100% rename from core/testcases/parsing/5390001.html rename to crates/core/testcases/parsing/5390001.html diff --git a/core/testcases/parsing/77p2p-7.live-105.html b/crates/core/testcases/parsing/77p2p-7.live-105.html similarity index 100% rename from core/testcases/parsing/77p2p-7.live-105.html rename to crates/core/testcases/parsing/77p2p-7.live-105.html diff --git a/core/testcases/parsing/byte_index_out_of_bounds.html b/crates/core/testcases/parsing/byte_index_out_of_bounds.html similarity index 100% rename from core/testcases/parsing/byte_index_out_of_bounds.html rename to crates/core/testcases/parsing/byte_index_out_of_bounds.html diff --git a/core/testcases/parsing/reddit.html b/crates/core/testcases/parsing/reddit.html similarity index 100% rename from core/testcases/parsing/reddit.html rename to crates/core/testcases/parsing/reddit.html diff --git a/core/testcases/parsing/whut.html b/crates/core/testcases/parsing/whut.html similarity index 100% rename from core/testcases/parsing/whut.html rename to crates/core/testcases/parsing/whut.html diff --git a/core/testcases/parsing/yasudaya.html b/crates/core/testcases/parsing/yasudaya.html similarity index 100% rename from core/testcases/parsing/yasudaya.html rename to crates/core/testcases/parsing/yasudaya.html diff --git a/core/testcases/schema_org/infinity_war.html b/crates/core/testcases/schema_org/infinity_war.html similarity index 100% rename from core/testcases/schema_org/infinity_war.html rename to crates/core/testcases/schema_org/infinity_war.html diff --git a/core/testcases/schema_org/recipe.html b/crates/core/testcases/schema_org/recipe.html similarity index 100% rename from core/testcases/schema_org/recipe.html rename to crates/core/testcases/schema_org/recipe.html diff --git a/core/testcases/schema_org/stackoverflow.html b/crates/core/testcases/schema_org/stackoverflow.html similarity index 100% rename from 
core/testcases/schema_org/stackoverflow.html rename to crates/core/testcases/schema_org/stackoverflow.html diff --git a/core/testcases/schema_org/stackoverflow_with_code.html b/crates/core/testcases/schema_org/stackoverflow_with_code.html similarity index 100% rename from core/testcases/schema_org/stackoverflow_with_code.html rename to crates/core/testcases/schema_org/stackoverflow_with_code.html diff --git a/crates/core/tests/common.rs b/crates/core/tests/common.rs new file mode 100644 index 000000000..494077c73 --- /dev/null +++ b/crates/core/tests/common.rs @@ -0,0 +1,27 @@ +use stract_core::index::Index; + +pub type Result = std::result::Result; + +pub fn temporary_index() -> Result { + let path = stdx::gen_temp_path(); + Index::open(&path) +} + +pub fn rand_words(num_words: usize) -> String { + use rand::{distributions::Alphanumeric, Rng}; + let mut res = String::new(); + + for _ in 0..num_words { + res.push_str( + rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(30) + .map(char::from) + .collect::() + .as_str(), + ); + res.push(' '); + } + + res.trim().to_string() +} diff --git a/crates/core/tests/optics.rs b/crates/core/tests/optics.rs new file mode 100644 index 000000000..f3b1ae78c --- /dev/null +++ b/crates/core/tests/optics.rs @@ -0,0 +1,1564 @@ +mod common; + +use optics::{Optic, SiteRankings}; +use webgraph::{Node, WebgraphWriter}; +use webpage::{Html, Webpage}; + +use stract_core::{ + ranking::inbound_similarity::InboundSimilarity, + searcher::{LocalSearcher, SearchQuery}, +}; + +use crate::common::{rand_words, temporary_index}; + +const CONTENT: &str = "this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever"; + +#[test] +fn discard_and_boost_sites() { + let mut index = temporary_index().expect("Unable to open index"); + + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + Website A + + + {CONTENT} {} + example example example + + + "#, + rand_words(100) + ), + "https://www.a.com", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + Website B + + + {CONTENT} {} + + + "#, + rand_words(100) + ), + "https://www.b.com", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.01, + page_centrality: 0.0, + + pre_computed_score: 0.0, + fetch_time_ms: 500, + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + + index.commit().expect("failed to commit index"); + let searcher = LocalSearcher::from(index); + + let res = searcher + .search(&SearchQuery { + query: "website".to_string(), + ..Default::default() + }) + .unwrap() + .webpages; + + assert_eq!(res.len(), 2); + assert_eq!(res[0].url, "https://www.b.com/"); + assert_eq!(res[1].url, "https://www.a.com/"); + + let res = searcher + .search(&SearchQuery { + query: "website".to_string(), + optic: Some( + Optic::parse( + r#" + Rule { + Matches { + Domain("b.com") + }, + Action(Discard) + } + "#, + ) + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + + assert_eq!(res.len(), 1); + assert_eq!(res[0].url, "https://www.a.com/"); + + let res = searcher + .search(&SearchQuery 
{ + query: "website".to_string(), + optic: Some( + Optic::parse( + r#" + Rule { + Matches { + Domain("a.com") + }, + Action(Boost(10)) + } + "#, + ) + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + + assert_eq!(res.len(), 2); + assert_eq!(res[0].url, "https://www.a.com/"); + assert_eq!(res[1].url, "https://www.b.com/"); +} + +#[test] +fn example_optics_dont_crash() { + let mut index = temporary_index().expect("Unable to open index"); + + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + Website A + + + {CONTENT} + example example example + + + "# + ), + "https://www.a.com/this/is/a/pattern", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + Website B + + + {CONTENT} + + + "# + ), + "https://www.b.com/this/is/b/pattern", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0001, + page_centrality: 0.0, + + pre_computed_score: 0.0, + fetch_time_ms: 500, + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + + index.commit().expect("failed to commit index"); + let searcher = LocalSearcher::from(index); + + let _ = searcher + .search(&SearchQuery { + query: "website".to_string(), + optic: Some( + Optic::parse(include_str!( + "../../optics/testcases/samples/quickstart.optic" + )) + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + + let _ = searcher + .search(&SearchQuery { + query: "website".to_string(), + optic: Some( + Optic::parse(include_str!( + "../../optics/testcases/samples/hacker_news.optic" + )) + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + + let _ = searcher + .search(&SearchQuery { + query: "website".to_string(), + optic: Some( + Optic::parse(include_str!( + "../../optics/testcases/samples/copycats_removal.optic" + )) + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; +} + +#[test] +fn empty_discard() { + let mut index = temporary_index().expect("Unable to open index"); + + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + Website A + + + {CONTENT} {} + example example example + + + "#, + rand_words(100) + ), + "https://www.a.com/this/is/a/pattern", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + Website B + + + {CONTENT} {} + + + "#, + rand_words(100) + ), + "https://www.b.com/this/is/b/pattern", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0001, + page_centrality: 0.0, + + pre_computed_score: 0.0, + fetch_time_ms: 500, + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + Website B + + + {CONTENT} {} + + + "#, + rand_words(100) + ), + "https://www.c.com/this/is/c/pattern", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0001, + page_centrality: 0.0, + + pre_computed_score: 0.0, + fetch_time_ms: 500, + node_id: None, + dmoz_description: None, + 
safety_classification: None, + }) + .expect("failed to insert webpage"); + + index.commit().expect("failed to commit index"); + let searcher = LocalSearcher::from(index); + + let res = searcher + .search(&SearchQuery { + query: "website".to_string(), + optic: Some( + Optic::parse( + r#" + DiscardNonMatching; + Rule { + Matches { + Domain("a.com") + }, + Action(Boost(6)) + }; + Rule { + Matches { + Domain("b.com") + }, + Action(Boost(1)) + }; + "#, + ) + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + + assert_eq!(res.len(), 2); + assert_eq!(res[0].url, "https://www.a.com/this/is/a/pattern"); +} + +#[test] +fn liked_sites() { + let mut index = temporary_index().expect("Unable to open index"); + + let mut writer = WebgraphWriter::new( + &stdx::gen_temp_path(), + executor::Executor::single_thread(), + webgraph::Compression::default(), + ); + + writer.insert( + Node::from("https://www.e.com").into_host(), + Node::from("https://www.a.com").into_host(), + String::new(), + ); + writer.insert( + Node::from("https://www.a.com").into_host(), + Node::from("https://www.e.com").into_host(), + String::new(), + ); + + writer.insert( + Node::from("https://www.c.com").into_host(), + Node::from("https://www.c.com").into_host(), + String::new(), + ); + + writer.insert( + Node::from("https://www.b.com").into_host(), + Node::from("https://www.e.com").into_host(), + String::new(), + ); + writer.insert( + Node::from("https://www.e.com").into_host(), + Node::from("https://www.b.com").into_host(), + String::new(), + ); + + let graph = writer.finalize(); + + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + Website A + + + {CONTENT} {} + example example example + + + "#, + rand_words(100) + ), + "https://www.a.com/this/is/a/pattern", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + dmoz_description: None, + safety_classification: None, + node_id: Some(Node::from("www.a.com").into_host().id()), + }) + .expect("failed to insert webpage"); + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + Website B + + + {CONTENT} {} + + + "#, + rand_words(100) + ), + "https://www.b.com/this/is/b/pattern", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0001, + page_centrality: 0.0, + + pre_computed_score: 0.0, + fetch_time_ms: 500, + dmoz_description: None, + safety_classification: None, + node_id: Some(Node::from("www.b.com").into_host().id()), + }) + .expect("failed to insert webpage"); + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + Website C + + + {CONTENT} {} + + + "#, + rand_words(100) + ), + "https://www.c.com/this/is/c/pattern", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0002, + page_centrality: 0.0, + + pre_computed_score: 0.0, + fetch_time_ms: 500, + dmoz_description: None, + safety_classification: None, + node_id: Some(Node::from("www.c.com").into_host().id()), + }) + .expect("failed to insert webpage"); + + index.commit().expect("failed to commit index"); + let mut searcher = LocalSearcher::from(index); + + searcher.set_inbound_similarity(InboundSimilarity::build(&graph)); + + let res = searcher + .search(&SearchQuery { + query: "website".to_string(), + optic: Some( + Optic::parse( + r#" + Like(Site("www.a.com")); + Like(Site("www.b.com")); + Dislike(Site("www.c.com")); + "#, + ) + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + + assert_eq!(res.len(), 3); + 
assert_eq!(res[0].url, "https://www.b.com/this/is/b/pattern"); + assert_eq!(res[1].url, "https://www.a.com/this/is/a/pattern"); + assert_eq!(res[2].url, "https://www.c.com/this/is/c/pattern"); +} + +#[test] +fn schema_org_search() { + let mut index = temporary_index().expect("Unable to open index"); + + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + Website A + + + + {CONTENT} {} + example example example + + + "#, + rand_words(100) + ), + "https://www.a.com/", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + index + .insert(Webpage { + html: Html::parse( + &format!( + r##" + + + Website B + + +
+ <div itemscope itemtype="http://schema.org/BlogPosting">
+ <section>
+ <h1>Comments</h1>
+ <article itemprop="comment" itemscope itemtype="http://schema.org/UserComments" id="c1">
+ <link itemprop="url" href="#c1">
+ <footer>
+ <p>Posted by: <span itemprop="creator" itemscope itemtype="http://schema.org/Person">
+ <span itemprop="name">Greg</span>
+ </span></p>
+ </footer>
+ <p>Ha!</p>
+ </article>
+ </section>
+ </div>
+ {CONTENT} {} + + + "##, + rand_words(100) + ), + "https://www.b.com/", + ).unwrap(), + backlink_labels: vec![], + host_centrality: 0.0001, + page_centrality: 0.0, + + pre_computed_score: 0.0, + fetch_time_ms: 500, + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + + index.commit().unwrap(); + let searcher = LocalSearcher::from(index); + + let res = searcher + .search(&SearchQuery { + query: "website".to_string(), + optic: Some( + Optic::parse( + r#" + DiscardNonMatching; + Rule { + Matches { + Schema("BlogPosting") + } + } + "#, + ) + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + + assert_eq!(res.len(), 1); + assert_eq!(res[0].url, "https://www.b.com/"); + + let res = searcher + .search(&SearchQuery { + query: "website".to_string(), + optic: Some( + Optic::parse( + r#" + DiscardNonMatching; + Rule { + Matches { + Schema("BlogPosting.comment") + } + } + "#, + ) + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + + assert_eq!(res.len(), 1); + assert_eq!(res[0].url, "https://www.b.com/"); + + let res = searcher + .search(&SearchQuery { + query: "website".to_string(), + optic: Some( + Optic::parse( + r#" + DiscardNonMatching; + Rule { + Matches { + Schema("ImageObject") + } + } + "#, + ) + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + + assert_eq!(res.len(), 1); + assert_eq!(res[0].url, "https://www.a.com/"); + + let res = searcher + .search(&SearchQuery { + query: "website".to_string(), + optic: Some( + Optic::parse( + r#" + DiscardNonMatching; + Rule { + Matches { + Schema("Person") + } + } + "#, + ) + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + + assert_eq!(res.len(), 1); + assert_eq!(res[0].url, "https://www.b.com/"); +} + +#[test] +fn pattern_same_phrase() { + let mut index = temporary_index().expect("Unable to open index"); + + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + Website A + + + {CONTENT} {} + example example example + + + "#, + rand_words(100) + ), + "https://chat.stackoverflow.com", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + + index.commit().expect("failed to commit index"); + let searcher = LocalSearcher::from(index); + + let res = searcher + .search(&SearchQuery { + query: "site:stackoverflow.com".to_string(), + optic: Some( + Optic::parse( + r#" + DiscardNonMatching; + Rule { + Matches { + Site("a.com") + }, + Action(Boost(6)) + }; + Rule { + Matches { + Site("stackoverflow.blog") + }, + Action(Boost(1)) + }; + Rule { + Matches { + Site("chat.b.eu") + }, + Action(Boost(1)) + }; + "#, + ) + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + + assert_eq!(res.len(), 0); +} + +#[test] +fn discard_all_discard_like() { + let mut index = temporary_index().expect("Unable to open index"); + + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + Website A + + + {CONTENT} {} + example example example + + + "#, + rand_words(100) + ), + "https://a.com", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + index + .insert(Webpage { + 
html: Html::parse( + &format!( + r#" + + + Website B + + + {CONTENT} {} + example example example + + + "#, + rand_words(100) + ), + "https://b.com/", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + + index.commit().expect("failed to commit index"); + let searcher = LocalSearcher::from(index); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse( + r#" + DiscardNonMatching; + Rule { + Matches { + Site("b.com") + } + }; + "#, + ) + .unwrap(), + ), + site_rankings: Some(SiteRankings { + liked: vec!["a.com".to_string()], + disliked: vec![], + blocked: vec![], + }), + ..Default::default() + }) + .unwrap() + .webpages; + + assert_eq!(res.len(), 1); + assert_eq!(res[0].url, "https://b.com/"); +} + +#[test] +fn discussion_optic() { + let mut index = temporary_index().expect("Unable to open index"); + + index + .insert(Webpage { + html: Html::parse( + include_str!("../testcases/schema_org/infinity_war.html"), + "https://a.com", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + index.commit().expect("failed to commit index"); + + let searcher = LocalSearcher::from(index); + let res = searcher + .search(&SearchQuery { + query: "avengers endgame".to_string(), + ..Default::default() + }) + .unwrap() + .webpages; + + assert!(!res.is_empty()); + assert_eq!(&res[0].url, "https://a.com/"); + + let res = searcher + .search(&SearchQuery { + query: "avengers endgame".to_string(), + optic: Some(Optic::parse(include_str!("../src/searcher/discussions.optic")).unwrap()), + ..Default::default() + }) + .unwrap() + .webpages; + assert!(res.is_empty()); +} + +#[test] +fn special_pattern_syntax() { + let mut index = temporary_index().expect("Unable to open index"); + + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + This is an example website + + + {CONTENT} {} + This is an example + + + "#, + rand_words(1000) + ), + "https://example.com", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + index.commit().expect("failed to commit index"); + + let searcher = LocalSearcher::from(index); + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 1); + assert_eq!(res[0].url, "https://example.com/"); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse("Rule { Matches { Title(\"is\") }, Action(Discard) }").unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 0); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse("Rule { Matches { Title(\"|is\") }, Action(Discard) }").unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 1); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + 
Optic::parse("Rule { Matches { Title(\"|This\") }, Action(Discard) }").unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 0); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse("Rule { Matches { Title(\"|This an\") }, Action(Discard) }").unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 1); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse("Rule { Matches { Title(\"|This * an\") }, Action(Discard) }") + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 0); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse("Rule { Matches { Site(\"example.com\") }, Action(Discard) }") + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 0); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse("Rule { Matches { Site(\"|example.com\") }, Action(Discard) }") + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 0); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse("Rule { Matches { Site(\"|example.com|\") }, Action(Discard) }") + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 0); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse("Rule { Matches { Title(\"website.com|\") }, Action(Discard) }") + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 1); +} + +#[test] +fn active_optic_with_blocked_sites() { + let mut index = temporary_index().expect("Unable to open index"); + + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + This is an example website + + + {CONTENT} {} + This is an example + + + "#, + rand_words(1000) + ), + "https://example.com", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + index.commit().expect("failed to commit index"); + + let searcher = LocalSearcher::from(index); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse( + "DiscardNonMatching; Rule { Matches { Title(\"is\") }, Action(Boost(0)) }", + ) + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 1); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse( + "DiscardNonMatching; Rule { Matches { Title(\"is\") }, Action(Boost(0)) }", + ) + .unwrap(), + ), + site_rankings: Some(SiteRankings { + liked: vec![], + disliked: vec![], + blocked: vec![String::from("example.com")], + }), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 0); +} + +#[test] +fn empty_optic_noop() { + let mut index = temporary_index().expect("Unable to open index"); + + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + This is an example website + + + {CONTENT} {} + This is an example + + + "#, + rand_words(1000) + ), + "https://example.com", + ) + .unwrap(), + backlink_labels: vec![], + 
host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + index.commit().expect("failed to commit index"); + + let searcher = LocalSearcher::from(index); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some(Optic::parse("").unwrap()), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 1); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some(Optic::parse("Rule { Matches { Title(\"\") }, Action(Discard) }").unwrap()), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 1); +} + +#[test] +fn wildcard_edge_cases() { + let mut index = temporary_index().expect("Unable to open index"); + + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + This is an example website + + + {CONTENT} {} + This is an example + + + "#, + rand_words(1000) + ), + "https://example.com", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + index + .insert(Webpage { + html: Html::parse( + &format!( + r#" + + + Another thing with no words in common + + + {CONTENT} {} + This is an example + + + "#, + rand_words(1000) + ), + "https://example.com", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }) + .expect("failed to insert webpage"); + index.commit().expect("failed to commit index"); + + let searcher = LocalSearcher::from(index); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse("Rule { Matches { Title(\"*\") }, Action(Discard) }").unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 0); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse("Rule { Matches { Title(\"* is\") }, Action(Discard) }").unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 1); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse("Rule { Matches { Title(\"* This is\") }, Action(Discard) }").unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 1); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse("Rule { Matches { Title(\"example *\") }, Action(Discard) }").unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 1); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse("Rule { Matches { Title(\"example website *\") }, Action(Discard) }") + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 1); +} + +#[test] +fn empty_double_anchor() { + let mut index = temporary_index().expect("Unable to open index"); + + let mut page = Webpage { + html: Html::parse( + r#" + + + This is an example website + + + Test + + + "#, + "https://example.com/", + ) + .unwrap(), + backlink_labels: vec![], + 
host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }; + + page.html.set_clean_text("".to_string()); + + index.insert(page).expect("failed to insert webpage"); + index.commit().expect("failed to commit index"); + + let searcher = LocalSearcher::from(index); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 1); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse( + "DiscardNonMatching; Rule { Matches { Content(\"||\") }, Action(Boost(0)) }", + ) + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 1); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse( + "DiscardNonMatching; Rule { Matches { Content(\"|\") }, Action(Boost(0)) }", + ) + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 1); +} + +#[test] +fn indieweb_search() { + let mut index = temporary_index().expect("Unable to open index"); + + let mut page = Webpage { + html: Html::parse( + r#" + + + This is an example indie website + + +
+ Microformats are amazing
+ This is the content of the article
+ Permalink + Author + +
+ + + "#, + "https://example.com/", + ).unwrap(), + backlink_labels: vec![], + host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }; + + page.html.set_clean_text("".to_string()); + + index.insert(page).expect("failed to insert webpage"); + + let mut page = Webpage { + html: Html::parse( + r#" + + + This is an example non-indie website + + + example example example + + + "#, + "https://non-indie-example.com/", + ) + .unwrap(), + backlink_labels: vec![], + host_centrality: 0.0, + page_centrality: 0.0, + fetch_time_ms: 500, + pre_computed_score: 0.0, + + node_id: None, + dmoz_description: None, + safety_classification: None, + }; + + page.html.set_clean_text("".to_string()); + + index.insert(page).expect("failed to insert webpage"); + index.commit().expect("failed to commit index"); + + let searcher = LocalSearcher::from(index); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 2); + + let res = searcher + .search(&SearchQuery { + query: "example".to_string(), + optic: Some( + Optic::parse("DiscardNonMatching; Rule { Matches { MicroformatTag(\"|h-*\") } }") + .unwrap(), + ), + ..Default::default() + }) + .unwrap() + .webpages; + assert_eq!(res.len(), 1); + assert_eq!(res[0].domain, "example.com"); +} diff --git a/crates/crawler/Cargo.toml b/crates/crawler/Cargo.toml new file mode 100644 index 000000000..5357945e2 --- /dev/null +++ b/crates/crawler/Cargo.toml @@ -0,0 +1,42 @@ +[package] +name = "crawler" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow.workspace = true +async-trait.workspace = true +bincode.workspace = true +chrono.workspace = true +distributed.workspace = true +encoding_rs.workspace = true +futures.workspace = true +hashbrown.workspace = true +hyperloglog.workspace = true +kv.workspace = true +memmap2.workspace = true +mime.workspace = true +proptest.workspace = true +quick-xml.workspace = true +rand.workspace = true +rayon.workspace = true +reqwest.workspace = true +rkyv.workspace = true +robotstxt-with-cache.workspace = true +rust-s3.workspace = true +serde.workspace = true +serde_json.workspace = true +sonic.workspace = true +stdx.workspace = true +stract-config.workspace = true +thiserror.workspace = true +tokio.workspace = true +tokio-stream.workspace = true +tracing.workspace = true +url.workspace = true +uuid.workspace = true +warc.workspace = true +webgraph.workspace = true +webpage.workspace = true diff --git a/core/src/crawler/coordinator.rs b/crates/crawler/src/coordinator.rs similarity index 62% rename from core/src/crawler/coordinator.rs rename to crates/crawler/src/coordinator.rs index cc4f20e39..77074353d 100644 --- a/core/src/crawler/coordinator.rs +++ b/crates/crawler/src/coordinator.rs @@ -14,15 +14,21 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . 
+use serde::{Deserialize, Serialize}; +use sonic::{service::Message, sonic_service}; + use super::{file_queue::FileQueue, Job, Result}; -use std::{path::Path, sync::Mutex}; +use std::{ + path::Path, + sync::{Arc, Mutex}, +}; pub struct CrawlCoordinator { jobs: Mutex>, } impl CrawlCoordinator { - pub fn new>(jobs_queue: P) -> Result { + pub fn new(jobs_queue: &Path) -> Result { Ok(Self { jobs: Mutex::new(FileQueue::new(jobs_queue)?), }) @@ -32,3 +38,22 @@ impl CrawlCoordinator { self.jobs.lock().unwrap_or_else(|e| e.into_inner()).pop() } } + +pub struct CoordinatorService { + pub coordinator: Arc, +} + +sonic_service!(CoordinatorService, [GetJob]); + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GetJob {} + +#[async_trait::async_trait] +impl Message for GetJob { + type Response = Option; + + async fn handle(self, server: &CoordinatorService) -> sonic::Result { + let job = server.coordinator.sample_job()?; + Ok(job) + } +} diff --git a/core/src/crawler/file_queue.rs b/crates/crawler/src/file_queue.rs similarity index 85% rename from core/src/crawler/file_queue.rs rename to crates/crawler/src/file_queue.rs index 92883dae1..87bfe3770 100644 --- a/core/src/crawler/file_queue.rs +++ b/crates/crawler/src/file_queue.rs @@ -40,19 +40,19 @@ impl FileQueueWriter where T: serde::Serialize + serde::de::DeserializeOwned, { - pub fn new>(path: P) -> Result { - if !path.as_ref().exists() { - std::fs::create_dir_all(path.as_ref())?; + pub fn new(path: &std::path::Path) -> Result { + if !path.exists() { + std::fs::create_dir_all(path)?; } let file = std::fs::OpenOptions::new() .create(true) .write(true) .read(true) - .open(path.as_ref().join(DATA_KEY))?; + .open(path.join(DATA_KEY))?; Ok(Self { - path: path.as_ref().to_path_buf(), + path: path.to_path_buf(), writer: BufWriter::new(file), _marker: std::marker::PhantomData, }) @@ -84,7 +84,7 @@ where let file = self.writer.into_inner()?; Ok(FileQueue { - pointer: FilePointer::new(self.path)?, + pointer: FilePointer::new(&self.path)?, file: unsafe { Mmap::map(&file)? }, _marker: std::marker::PhantomData, }) @@ -96,16 +96,16 @@ struct FilePointer { } impl FilePointer { - fn new>(path: P) -> Result { - if !path.as_ref().exists() { - std::fs::create_dir_all(path.as_ref())?; + fn new(path: &std::path::Path) -> Result { + if !path.exists() { + std::fs::create_dir_all(path)?; } let file = std::fs::OpenOptions::new() .create(true) .write(true) .read(true) - .open(path.as_ref().join(POINTER_KEY))?; + .open(path.join(POINTER_KEY))?; Ok(Self { file }) } @@ -140,12 +140,12 @@ impl FileQueue where T: serde::Serialize + serde::de::DeserializeOwned, { - pub fn new>(path: P) -> Result { - if !path.as_ref().exists() { - std::fs::create_dir_all(path.as_ref())?; + pub fn new(path: &std::path::Path) -> Result { + if !path.exists() { + std::fs::create_dir_all(path)?; } - let file = File::open(path.as_ref().join(DATA_KEY))?; + let file = File::open(path.join(DATA_KEY))?; let file = unsafe { Mmap::map(&file)? 
}; Ok(Self { @@ -184,7 +184,7 @@ mod tests { #[test] fn simple() { - let mut writer = FileQueueWriter::new(crate::gen_temp_path()).unwrap(); + let mut writer = FileQueueWriter::new(&stdx::gen_temp_path()).unwrap(); writer.push("Hello".to_string()).unwrap(); writer.push("World".to_string()).unwrap(); @@ -201,7 +201,7 @@ mod tests { fn prop(data: Vec) { let expected = data.clone(); - let mut writer = FileQueueWriter::new(crate::gen_temp_path()).unwrap(); + let mut writer = FileQueueWriter::new(&stdx::gen_temp_path()).unwrap(); for item in data { writer.push(item).unwrap(); diff --git a/core/src/crawler/mod.rs b/crates/crawler/src/lib.rs similarity index 98% rename from core/src/crawler/mod.rs rename to crates/crawler/src/lib.rs index aae8416bd..ef85e1b5f 100644 --- a/core/src/crawler/mod.rs +++ b/crates/crawler/src/lib.rs @@ -18,18 +18,17 @@ use std::{collections::VecDeque, sync::Arc, time::Duration}; use hashbrown::HashMap; +use stract_config::CrawlerConfig; use url::Url; - -use crate::{config::CrawlerConfig, warc, webpage::url_ext::UrlExt}; +use webpage::url_ext::UrlExt; use self::{warc_writer::WarcWriter, worker::WorkerThread}; pub mod coordinator; -mod robots_txt; -pub mod router; -pub use router::Router; mod file_queue; pub mod planner; +mod robots_txt; +pub mod router; mod site_graph; mod warc_writer; mod worker; diff --git a/core/src/crawler/planner.rs b/crates/crawler/src/planner.rs similarity index 92% rename from core/src/crawler/planner.rs rename to crates/crawler/src/planner.rs index 5feff7861..1c4eac9a4 100644 --- a/core/src/crawler/planner.rs +++ b/crates/crawler/src/planner.rs @@ -14,6 +14,7 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . use anyhow::{anyhow, Result}; +use kv::{rocksdb_store::RocksDbStore, Kv}; use rayon::prelude::*; use std::collections::HashMap; use std::{ @@ -21,17 +22,14 @@ use std::{ path::Path, sync::{atomic::AtomicUsize, Mutex}, }; +use stract_config::CrawlPlannerConfig; use url::Url; - -use crate::webgraph::centrality::{top_hosts, TopHosts}; -use crate::{ - config::CrawlPlannerConfig, - crawler::{file_queue::FileQueueWriter, Job}, - kv::{rocksdb_store::RocksDbStore, Kv}, - webgraph::{NodeID, Webgraph}, +use webgraph::{ + centrality::{top_hosts, TopHosts}, + NodeID, Webgraph, }; -use super::Domain; +use super::{file_queue::FileQueueWriter, Domain, Job}; fn all_pages( page_centrality: &RocksDbStore, @@ -76,21 +74,21 @@ fn check_config(config: &CrawlPlannerConfig) -> Result<()> { Ok(()) } -pub fn make_crawl_plan>( +pub fn make_crawl_plan( host_centrality: RocksDbStore, page_centrality: RocksDbStore, host_graph: Webgraph, page_graph: Webgraph, config: CrawlPlannerConfig, - output: P, + output: &Path, ) -> Result<()> { check_config(&config)?; - if output.as_ref().exists() { + if output.exists() { return Err(anyhow!("output path already exists")); } - let queue_path = output.as_ref().join("job_queue"); + let queue_path = output.join("job_queue"); std::fs::create_dir_all(&queue_path)?; let hosts = top_hosts( @@ -103,7 +101,7 @@ pub fn make_crawl_plan>( let job_queues: Vec>> = (0..config.num_job_queues) .map(|i| { let path = queue_path.join(format!("{}.queue", i)); - FileQueueWriter::new(path) + FileQueueWriter::new(&path) }) .collect::>>()? 
.into_iter() @@ -188,7 +186,7 @@ pub fn make_crawl_plan>( stats: stats.into_inner().unwrap_or_else(|e| e.into_inner()), }; - let metadata_path = output.as_ref().join("metadata.json"); + let metadata_path = output.join("metadata.json"); let metadata_file = std::fs::File::create(metadata_path)?; serde_json::to_writer_pretty(metadata_file, &metadata)?; diff --git a/core/src/crawler/robots_txt.rs b/crates/crawler/src/robots_txt.rs similarity index 100% rename from core/src/crawler/robots_txt.rs rename to crates/crawler/src/robots_txt.rs diff --git a/core/src/crawler/router.rs b/crates/crawler/src/router.rs similarity index 75% rename from core/src/crawler/router.rs rename to crates/crawler/src/router.rs index fa4fcec2e..f2bae9584 100644 --- a/core/src/crawler/router.rs +++ b/crates/crawler/src/router.rs @@ -1,15 +1,33 @@ use anyhow::Result; +use distributed::retry_strategy::ExponentialBackoff; use rand::Rng; +use serde::{Deserialize, Serialize}; +use sonic::{service::Message, sonic_service}; use std::{net::SocketAddr, time::Duration}; use tokio::sync::Mutex; -use crate::{ - distributed::{retry_strategy::ExponentialBackoff, sonic}, - entrypoint::crawler::coordinator::{CoordinatorService, GetJob}, +use super::{ + coordinator::{CoordinatorService, GetJob}, + Job, }; -use super::Job; +pub struct RouterService { + pub router: Router, +} + +sonic_service!(RouterService, [NewJob]); + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NewJob {} +#[async_trait::async_trait] +impl Message for NewJob { + type Response = Option; + + async fn handle(self, server: &RouterService) -> sonic::Result { + Ok(server.router.sample_job().await?) + } +} struct RemoteCoordinator { addr: SocketAddr, } diff --git a/core/src/crawler/site_graph.rs b/crates/crawler/src/site_graph.rs similarity index 98% rename from core/src/crawler/site_graph.rs rename to crates/crawler/src/site_graph.rs index b13a1cffb..61fc25761 100644 --- a/core/src/crawler/site_graph.rs +++ b/crates/crawler/src/site_graph.rs @@ -17,9 +17,9 @@ //! In-memory graph that the worker constructs for the site during crawl. use hashbrown::{HashMap, HashSet}; +use hyperloglog::HyperLogLog; use url::Url; - -use crate::{hyperloglog::HyperLogLog, kahan_sum::KahanSum}; +use webgraph::kahan_sum::KahanSum; #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub struct Node { diff --git a/core/src/crawler/warc_writer.rs b/crates/crawler/src/warc_writer.rs similarity index 97% rename from core/src/crawler/warc_writer.rs rename to crates/crawler/src/warc_writer.rs index a2b06e541..f5b0a1ee5 100644 --- a/core/src/crawler/warc_writer.rs +++ b/crates/crawler/src/warc_writer.rs @@ -15,11 +15,7 @@ // along with this program. If not, see . use std::time::Duration; - -use crate::{ - config::{self, S3Config}, - warc, -}; +use stract_config::S3Config; use super::{CrawlDatum, Result}; @@ -35,7 +31,7 @@ pub enum WarcWriterMessage { Finish, } -async fn commit(writer: warc::WarcWriter, s3: config::S3Config) { +async fn commit(writer: warc::WarcWriter, s3: stract_config::S3Config) { let filename = format!( "{}_{}.warc.gz", chrono::Utc::now().to_rfc3339(), diff --git a/core/src/crawler/worker.rs b/crates/crawler/src/worker.rs similarity index 99% rename from core/src/crawler/worker.rs rename to crates/crawler/src/worker.rs index 0e67ad9cd..fe3f7cf42 100644 --- a/core/src/crawler/worker.rs +++ b/crates/crawler/src/worker.rs @@ -13,6 +13,7 @@ // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . 
+use distributed::retry_strategy::ExponentialBackoff; use encoding_rs::{Encoding, UTF_8}; use futures::{future::BoxFuture, FutureExt}; use hashbrown::{HashMap, HashSet}; @@ -28,15 +29,13 @@ use std::{ time::{Duration, Instant}, }; +use stract_config::CrawlerConfig; use url::Url; +use webpage::Html; -use crate::{ - config::CrawlerConfig, - crawler::MAX_URL_LEN_BYTES, - distributed::{retry_strategy::ExponentialBackoff, sonic}, - entrypoint::crawler::router::{NewJob, RouterService}, - warc, - webpage::Html, +use super::{ + router::{NewJob, RouterService}, + MAX_URL_LEN_BYTES, }; use super::{ diff --git a/crates/distributed/Cargo.toml b/crates/distributed/Cargo.toml new file mode 100644 index 000000000..77246c7e5 --- /dev/null +++ b/crates/distributed/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "distributed" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow.workspace = true +chitchat.workspace = true +serde = { workspace = true, features = ["derive"] } +serde_json.workspace = true +tokio.workspace = true +tokio-stream = "0.1.14" +tracing.workspace = true +uuid.workspace = true diff --git a/core/src/distributed/cluster.rs b/crates/distributed/src/cluster.rs similarity index 98% rename from core/src/distributed/cluster.rs rename to crates/distributed/src/cluster.rs index c5df507a9..24ca9b72a 100644 --- a/core/src/distributed/cluster.rs +++ b/crates/distributed/src/cluster.rs @@ -22,7 +22,7 @@ use tokio::sync::RwLock; use tokio_stream::StreamExt; use tracing::error; -use crate::distributed::member::{Member, Service}; +use crate::member::{Member, Service}; const CLUSTER_ID: &str = "stract-cluster"; const GOSSIP_INTERVAL: Duration = Duration::from_secs(1); diff --git a/core/src/distributed/mod.rs b/crates/distributed/src/lib.rs similarity index 98% rename from core/src/distributed/mod.rs rename to crates/distributed/src/lib.rs index 9d64573fa..04b1a33b1 100644 --- a/core/src/distributed/mod.rs +++ b/crates/distributed/src/lib.rs @@ -16,4 +16,3 @@ pub mod cluster; pub mod member; pub mod retry_strategy; -pub mod sonic; diff --git a/core/src/distributed/member.rs b/crates/distributed/src/member.rs similarity index 86% rename from core/src/distributed/member.rs rename to crates/distributed/src/member.rs index d38c75f83..1b86c8e2c 100644 --- a/core/src/distributed/member.rs +++ b/crates/distributed/src/member.rs @@ -18,7 +18,14 @@ use std::net::SocketAddr; use serde::{Deserialize, Serialize}; -use crate::searcher::ShardId; +#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)] +pub struct ShardId(u64); + +impl ShardId { + pub fn new(id: u64) -> ShardId { + ShardId(id) + } +} #[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)] pub enum Service { diff --git a/core/src/distributed/retry_strategy.rs b/crates/distributed/src/retry_strategy.rs similarity index 100% rename from core/src/distributed/retry_strategy.rs rename to crates/distributed/src/retry_strategy.rs diff --git a/crates/entity_index/Cargo.toml b/crates/entity_index/Cargo.toml new file mode 100644 index 000000000..d0fbbdd0e --- /dev/null +++ b/crates/entity_index/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "entity_index" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +base64.workspace = true +bincode.workspace = true +bzip2.workspace = true +imager.workspace = true +itertools.workspace = 
true +kv.workspace = true +md5.workspace = true +parse_wiki_text.workspace = true +quick-xml = "0.30.0" +serde.workspace = true +stdx.workspace = true +tantivy.workspace = true +thiserror.workspace = true +tokenizer.workspace = true +tracing.workspace = true +url.workspace = true +utoipa.workspace = true + +[dev-dependencies] +insta.workspace = true diff --git a/core/src/entrypoint/entity.rs b/crates/entity_index/src/builder.rs similarity index 98% rename from core/src/entrypoint/entity.rs rename to crates/entity_index/src/builder.rs index 14a8267a1..d47ee1134 100644 --- a/core/src/entrypoint/entity.rs +++ b/crates/entity_index/src/builder.rs @@ -14,19 +14,18 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . +use crate::{ + entity::{Entity, Paragraph, Span, WikiNodeExt}, + EntityIndex, +}; use std::{ collections::{BTreeMap, HashSet}, fs::File, io::{BufRead, BufReader}, + path::Path, }; -use crate::{ - entity_index::{ - entity::{Entity, Paragraph, Span, WikiNodeExt}, - EntityIndex, - }, - Result, -}; +use crate::Result; use bzip2::bufread::MultiBzDecoder; use itertools::Itertools; @@ -99,7 +98,7 @@ impl Iterator for EntityIterator { pub struct EntityIndexer; impl EntityIndexer { - pub fn run(wikipedia_dump_path: String, output_path: String) -> Result<()> { + pub fn run(wikipedia_dump_path: &Path, output_path: &Path) -> Result<()> { let reader = BufReader::new(File::open(wikipedia_dump_path)?); let reader = BufReader::new(MultiBzDecoder::new(reader)); let mut index = EntityIndex::open(output_path)?; @@ -291,7 +290,7 @@ impl EntityBuilder { #[cfg(test)] mod tests { - use crate::entity_index::entity::EntitySnippet; + use crate::entity::EntitySnippet; use super::*; @@ -369,7 +368,7 @@ mod tests { fn aristotle() { check_abstract( "Aristotle", - include_str!("../../testcases/entity/aristotle.txt"), + include_str!("../../core/testcases/entity/aristotle.txt"), expect!(@r###" Title: Aristotle Image: Some("Aristotle Altemps Inv8575.jpg") @@ -437,7 +436,7 @@ mod tests { fn barack_obama() { check_abstract( "Barack Obama", - include_str!("../../testcases/entity/obama.txt"), + include_str!("../../core/testcases/entity/obama.txt"), expect!(@r###" Title: Barack Obama Image: Some("President Barack Obama.jpg") @@ -500,7 +499,7 @@ mod tests { fn algorithm() { check_abstract( "Algorithm", - include_str!("../../testcases/entity/algorithm.txt"), + include_str!("../../core/testcases/entity/algorithm.txt"), expect!(@r###" Title: Algorithm Image: None @@ -521,7 +520,7 @@ mod tests { fn andre() { check_abstract( "Andre", - include_str!("../../testcases/entity/andre.txt"), + include_str!("../../core/testcases/entity/andre.txt"), expect!(@r###" Title: Andre Image: Some("Andre Agassi (2011).jpg") @@ -581,7 +580,7 @@ mod tests { fn skip_disambiguation_pages() { assert!(EntityBuilder { title: "Test".to_string(), - text: include_str!("../../testcases/entity/disambiguation.txt").to_string(), + text: include_str!("../../core/testcases/entity/disambiguation.txt").to_string(), } .build() .is_none()); diff --git a/core/src/entity_index/entity.rs b/crates/entity_index/src/entity.rs similarity index 87% rename from core/src/entity_index/entity.rs rename to crates/entity_index/src/entity.rs index 8ff853129..d6ee5ed6a 100644 --- a/core/src/entity_index/entity.rs +++ b/crates/entity_index/src/entity.rs @@ -245,3 +245,46 @@ impl<'a> From<&[Node<'a>]> for Span { span } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn simple_link_to_html() { + 
assert_eq!( + EntitySnippet::from_span( + &Span { + text: "some text with a link".to_string(), + links: vec![Link { + start: 5, + end: 9, + target: "text article".to_string() + }] + }, + 10000 + ) + .to_md(None), + "some [text](https://en.wikipedia.org/wiki/text_article) with a link".to_string() + ); + } + + #[test] + fn truncated_link_to_html() { + assert_eq!( + EntitySnippet::from_span( + &Span { + text: "some text".to_string(), + links: vec![Link { + start: 5, + end: 9, + target: "text article".to_string() + }] + }, + 7 + ) + .to_md(None), + "some [te](https://en.wikipedia.org/wiki/text_article)...".to_string() + ); + } +} diff --git a/core/src/entity_index/mod.rs b/crates/entity_index/src/lib.rs similarity index 94% rename from core/src/entity_index/mod.rs rename to crates/entity_index/src/lib.rs index 138ff8157..6a88cbd80 100644 --- a/core/src/entity_index/mod.rs +++ b/crates/entity_index/src/lib.rs @@ -23,6 +23,11 @@ use std::{ }; use base64::{prelude::BASE64_STANDARD as BASE64_ENGINE, Engine}; +use imager::{ + image_downloader::{ImageDownloadJob, ImageDownloader}, + image_store::{EntityImageStore, Image, ImageStore}, +}; +use kv::{rocksdb_store::RocksDbStore, Kv}; use serde::{Deserialize, Serialize}; use tantivy::{ collector::TopDocs, @@ -31,19 +36,23 @@ use tantivy::{ tokenizer::Tokenizer, DocAddress, IndexReader, IndexWriter, Searcher, TantivyDocument, Term, }; +use tokenizer::Normal; use tracing::info; use url::Url; -use crate::{ - image_downloader::{ImageDownloadJob, ImageDownloader}, - image_store::{EntityImageStore, Image, ImageStore}, - kv::{rocksdb_store::RocksDbStore, Kv}, - tokenizer::Normal, - Result, -}; - use self::entity::{Entity, Link, Span}; -pub(crate) mod entity; +pub mod builder; +pub mod entity; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("io error")] + Io(#[from] std::io::Error), + #[error("tantivy error")] + Tantivy(#[from] tantivy::error::TantivyError), +} + +pub type Result = std::result::Result; pub struct EntityIndex { image_store: EntityImageStore, @@ -163,13 +172,13 @@ pub struct EntityMatch { } impl EntityIndex { - pub fn open>(path: P) -> Result { - if !path.as_ref().exists() { - fs::create_dir_all(path.as_ref())?; + pub fn open(path: &Path) -> Result { + if !path.exists() { + fs::create_dir_all(path)?; } let schema = schema(); - let tv_path = path.as_ref().join("inverted_index"); + let tv_path = path.join("inverted_index"); let tantivy_index = if tv_path.exists() { tantivy::Index::open_in_dir(&tv_path)? } else { @@ -177,11 +186,10 @@ impl EntityIndex { tantivy::Index::create_in_dir(&tv_path, schema.clone())? 
}; - let attribute_occurrences = Box::new(RocksDbStore::open( - path.as_ref().join("attribute_occurrences"), - )); + let attribute_occurrences = + Box::new(RocksDbStore::open(&path.join("attribute_occurrences"))); - let stopwords: HashSet = include_str!("../../stopwords/English.txt") + let stopwords: HashSet = include_str!("../../core/stopwords/English.txt") .lines() .take(50) .map(str::to_ascii_lowercase) @@ -192,7 +200,7 @@ impl EntityIndex { Normal::with_stopwords(stopwords.clone().into_iter().collect()), ); - let image_store = EntityImageStore::open(path.as_ref().join("images")); + let image_store = EntityImageStore::open(&path.join("images")); let writer = tantivy_index.writer(10_000_000_000)?; let reader = tantivy_index.reader()?; @@ -486,7 +494,7 @@ mod tests { #[test] fn stopwords_title_ignored() { - let mut index = EntityIndex::open(crate::gen_temp_path()).unwrap(); + let mut index = EntityIndex::open(&stdx::gen_temp_path()).unwrap(); index.insert(Entity { title: "the ashes".to_string(), diff --git a/crates/executor/Cargo.toml b/crates/executor/Cargo.toml new file mode 100644 index 000000000..8d7d5c780 --- /dev/null +++ b/crates/executor/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "executor" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +crossbeam-channel.workspace = true +num_cpus.workspace = true +rayon.workspace = true +thiserror.workspace = true +tracing.workspace = true diff --git a/core/src/executor.rs b/crates/executor/src/lib.rs similarity index 93% rename from core/src/executor.rs rename to crates/executor/src/lib.rs index 597ae0697..03cad4fd1 100644 --- a/core/src/executor.rs +++ b/crates/executor/src/lib.rs @@ -14,11 +14,19 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . 
-use crate::{Error, Result}; - use crossbeam_channel::unbounded; use rayon::{ThreadPool, ThreadPoolBuilder}; +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("At least one of the scheduled jobs failed")] + AtLeastOneOfTheScheduledJobsFailed, + #[error("Rayon thread pool error")] + Rayon(#[from] rayon::ThreadPoolBuildError), +} + +pub type Result = std::result::Result; + pub enum Executor { #[allow(unused)] SingleThread, @@ -88,10 +96,7 @@ impl Executor { let results: Vec = result_placeholders.into_iter().flatten().collect(); if results.len() != num_jobs { - return Err(Error::InternalError( - "At least one of the scheduled jobs failed.".to_string(), - ) - .into()); + return Err(Error::AtLeastOneOfTheScheduledJobsFailed); } Ok(results) diff --git a/crates/hyperloglog/Cargo.toml b/crates/hyperloglog/Cargo.toml new file mode 100644 index 000000000..f3e73815f --- /dev/null +++ b/crates/hyperloglog/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "hyperloglog" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +serde = { workspace = true, features = ["derive"] } diff --git a/core/src/hyperloglog.rs b/crates/hyperloglog/src/lib.rs similarity index 100% rename from core/src/hyperloglog.rs rename to crates/hyperloglog/src/lib.rs diff --git a/crates/imager/Cargo.toml b/crates/imager/Cargo.toml new file mode 100644 index 000000000..3c8e9aef6 --- /dev/null +++ b/crates/imager/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "imager" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +bincode.workspace = true +distributed.workspace = true +futures.workspace = true +image.workspace = true +kv.workspace = true +reqwest.workspace = true +serde.workspace = true +stdx.workspace = true +thiserror.workspace = true +tokio.workspace = true +tracing.workspace = true +url.workspace = true diff --git a/core/src/image_downloader.rs b/crates/imager/src/image_downloader.rs similarity index 97% rename from core/src/image_downloader.rs rename to crates/imager/src/image_downloader.rs index b187ecefe..dbcdbf0d9 100644 --- a/core/src/image_downloader.rs +++ b/crates/imager/src/image_downloader.rs @@ -16,13 +16,11 @@ use futures::StreamExt; use std::{collections::HashSet, hash::Hash, time::Duration}; +use distributed::retry_strategy::ExponentialBackoff; use serde::Serialize; use url::Url; -use crate::{ - distributed::retry_strategy::ExponentialBackoff, - image_store::{Image, ImageStore}, -}; +use crate::image_store::{Image, ImageStore}; #[derive(Clone, Debug)] pub struct ImageDownloadJob diff --git a/core/src/image_store.rs b/crates/imager/src/image_store.rs similarity index 90% rename from core/src/image_store.rs rename to crates/imager/src/image_store.rs index f36e273bf..f0adc79e9 100644 --- a/core/src/image_store.rs +++ b/crates/imager/src/image_store.rs @@ -14,14 +14,15 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . 
-use crate::kv::{rocksdb_store::RocksDbStore, Kv}; -use crate::Result; use image::imageops::FilterType; use image::{DynamicImage, ImageOutputFormat}; +use kv::{rocksdb_store::RocksDbStore, Kv}; use serde::{de, ser::SerializeStruct, Serialize}; use std::io::{Cursor, Read, Seek}; use std::path::Path; +use crate::{Error, Result}; + #[derive(PartialEq, Debug, Clone)] pub struct Image(DynamicImage); @@ -87,11 +88,11 @@ struct BaseImageStore { impl BaseImageStore { #[cfg(test)] - fn open>(path: P) -> Self { + fn open(path: &Path) -> Self { Self::open_with_filters(path, Vec::new()) } - fn open_with_filters>(path: P, filters: Vec>) -> Self { + fn open_with_filters(path: &Path, filters: Vec>) -> Self { let store = Box::new(RocksDbStore::open(path)); Self { store, filters } @@ -144,7 +145,7 @@ pub struct EntityImageStore { } impl EntityImageStore { - pub fn open>(path: P) -> Self { + pub fn open(path: &Path) -> Self { let store = BaseImageStore::open_with_filters( path, vec![Box::new(ResizeFilter { @@ -184,14 +185,18 @@ impl Image { { Ok(Self(img)) } else { - Ok(Self(image::load_from_memory_with_format( - &bytes, - image::ImageFormat::Png, - )?)) + Ok(Self( + image::load_from_memory_with_format(&bytes, image::ImageFormat::Png).map_err( + |source| Error::InvalidImageFormat { + source, + format: image::ImageFormat::Png, + }, + )?, + )) } } - pub(crate) fn as_raw_bytes(&self) -> Vec { + pub fn as_raw_bytes(&self) -> Vec { let mut cursor = Cursor::new(Vec::new()); self.0 .write_to(&mut cursor, ImageOutputFormat::Png) @@ -233,7 +238,7 @@ mod tests { ImageBuffer::from_pixel(2, 2, image::Rgb::([u16::MAX, u16::MAX, u16::MAX])).into(), ); let key = "test".to_string(); - let mut store = BaseImageStore::open(crate::gen_temp_path()); + let mut store = BaseImageStore::open(&stdx::gen_temp_path()); assert_eq!(store.get(&key), None); store.insert(key.clone(), image.clone()); diff --git a/crates/imager/src/lib.rs b/crates/imager/src/lib.rs new file mode 100644 index 000000000..5ae5464d5 --- /dev/null +++ b/crates/imager/src/lib.rs @@ -0,0 +1,13 @@ +pub mod image_downloader; +pub mod image_store; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("image was not of format: {format:?}")] + InvalidImageFormat { + source: image::ImageError, + format: image::ImageFormat, + }, +} + +pub type Result = std::result::Result; diff --git a/crates/kv/Cargo.toml b/crates/kv/Cargo.toml new file mode 100644 index 000000000..96a85d510 --- /dev/null +++ b/crates/kv/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "kv" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +bincode.workspace = true +rocksdb.workspace = true +serde.workspace = true diff --git a/core/src/kv/mod.rs b/crates/kv/src/lib.rs similarity index 100% rename from core/src/kv/mod.rs rename to crates/kv/src/lib.rs diff --git a/core/src/kv/rocksdb_store.rs b/crates/kv/src/rocksdb_store.rs similarity index 94% rename from core/src/kv/rocksdb_store.rs rename to crates/kv/src/rocksdb_store.rs index a825799ca..a3b123520 100644 --- a/core/src/kv/rocksdb_store.rs +++ b/crates/kv/src/rocksdb_store.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . 
-use std::{fs, marker::PhantomData}; +use std::{fs, marker::PhantomData, path::Path}; use rocksdb::{ BlockBasedOptions, DBIteratorWithThreadMode, DBWithThreadMode, IteratorMode, Options, @@ -22,7 +22,7 @@ use rocksdb::{ }; use serde::{de::DeserializeOwned, Serialize}; -use crate::kv::Kv; +use crate::Kv; pub struct RocksDbStore where @@ -39,12 +39,9 @@ where K: Serialize + DeserializeOwned, V: Serialize + DeserializeOwned, { - pub fn open
(path: P) -> Self - where - P: AsRef, - { - if !path.as_ref().exists() { - fs::create_dir_all(path.as_ref()).expect("faild to create dir"); + pub fn open(path: &Path) -> Self { + if !path.exists() { + fs::create_dir_all(path).expect("faild to create dir"); } let mut options = Options::default(); diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml new file mode 100644 index 000000000..15ac12e08 --- /dev/null +++ b/crates/llm/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "stract-llm" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +bincode.workspace = true +byteorder.workspace = true +flate2.workspace = true +itertools.workspace = true +serde.workspace = true +stdx.workspace = true +tch.workspace = true +thiserror.workspace = true +tokenizers.workspace = true +tracing.workspace = true diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs new file mode 100644 index 000000000..fba26ec72 --- /dev/null +++ b/crates/llm/src/lib.rs @@ -0,0 +1,4 @@ +pub mod llm_utils; +pub mod qa_model; +pub mod summarizer; +pub mod word2vec; diff --git a/core/src/llm_utils.rs b/crates/llm/src/llm_utils.rs similarity index 100% rename from core/src/llm_utils.rs rename to crates/llm/src/llm_utils.rs diff --git a/core/src/qa_model.rs b/crates/llm/src/qa_model.rs similarity index 98% rename from core/src/qa_model.rs rename to crates/llm/src/qa_model.rs index f24da1f49..ecd7ca119 100644 --- a/core/src/qa_model.rs +++ b/crates/llm/src/qa_model.rs @@ -47,7 +47,7 @@ pub struct Answer { } impl QaModel { - pub fn open>(folder: P) -> Result { + pub fn open(folder: &Path) -> Result { let truncation = TruncationParams { max_length: TRUNCATE_INPUT, ..Default::default() @@ -56,12 +56,11 @@ impl QaModel { ..Default::default() }; - let mut tokenizer = - tokenizers::Tokenizer::from_file(folder.as_ref().join("tokenizer.json"))?; + let mut tokenizer = tokenizers::Tokenizer::from_file(folder.join("tokenizer.json"))?; tokenizer.with_truncation(Some(truncation))?; tokenizer.with_padding(Some(padding)); - let model = tch::CModule::load(folder.as_ref().join("model.pt"))?; + let model = tch::CModule::load(folder.join("model.pt"))?; Ok(Self { tokenizer, model, @@ -199,7 +198,7 @@ mod tests { fn open_qa_model() -> QaModel { QaModel::open( - std::path::Path::new("../data/qa_model") + &std::path::Path::new("../../data/qa_model") .canonicalize() .expect("QA model not found in data/qa_model"), ) diff --git a/core/src/summarizer.rs b/crates/llm/src/summarizer.rs similarity index 94% rename from core/src/summarizer.rs rename to crates/llm/src/summarizer.rs index f571a8f06..015508f6d 100644 --- a/core/src/summarizer.rs +++ b/crates/llm/src/summarizer.rs @@ -27,9 +27,8 @@ use tch::{IValue, Kind, Tensor}; use tokenizers::{PaddingParams, TruncationParams}; use crate::{ - ceil_char_boundary, llm_utils::{self, ClonableTensor}, - spell::word2vec::{Word2Vec, WordVec}, + word2vec::{Word2Vec, WordVec}, }; #[derive(thiserror::Error, Debug)] @@ -44,7 +43,7 @@ pub enum Error { Io(#[from] std::io::Error), #[error("Word2vec")] - Word2Vec(#[from] crate::spell::word2vec::Error), + Word2Vec(#[from] crate::word2vec::Error), #[error("Unexpected output type")] UnexpectedOutputType, @@ -154,7 +153,7 @@ impl<'a> Iterator for OverlappingSents<'a> { self.text = ""; self.prev_end += end; } else { - let next_start = ceil_char_boundary(self.text, next_start + 1); + let next_start = stdx::ceil_char_boundary(self.text, next_start + 1); self.text = 
&self.text[next_start..]; self.prev_end += next_start; @@ -231,7 +230,7 @@ pub struct ExtractiveSummarizer { } impl ExtractiveSummarizer { - pub fn open>(path: P, top_n_passages: usize) -> Result { + pub fn open(path: &Path, top_n_passages: usize) -> Result { Ok(Self { passage_scorer: DualEncoder::open(path)?, top_n_passages, @@ -288,7 +287,7 @@ impl ExtractiveSummarizer { for (a, mut b) in best_passages.into_iter().tuple_windows() { if a.range.end > b.range.start { - b.range.start = ceil_char_boundary(text, a.range.end); + b.range.start = stdx::ceil_char_boundary(text, a.range.end); b.passage = &text[b.range.clone()]; } @@ -323,16 +322,11 @@ pub struct Summarizer { } impl Summarizer { - pub fn open>(path: P) -> Result { + pub fn open(path: &Path) -> Result { Ok(Self { - extractive: ExtractiveSummarizer::open( - path.as_ref().join("dual_encoder").as_path(), - 50, - )?, + extractive: ExtractiveSummarizer::open(path.join("dual_encoder").as_path(), 50)?, abstractive: AbstractiveSummarizer { - model: Arc::new(AbstractiveModel::open( - path.as_ref().join("abstractive").as_path(), - )?), + model: Arc::new(AbstractiveModel::open(path.join("abstractive").as_path())?), }, }) } @@ -360,7 +354,7 @@ pub struct DualEncoder { } impl DualEncoder { - pub fn open>(folder: P) -> Result { + pub fn open(folder: &Path) -> Result { let truncation = TruncationParams { max_length: 256, ..Default::default() @@ -370,13 +364,12 @@ impl DualEncoder { ..Default::default() }; - let mut tokenizer = - tokenizers::Tokenizer::from_file(folder.as_ref().join("tokenizer.json"))?; + let mut tokenizer = tokenizers::Tokenizer::from_file(&folder.join("tokenizer.json"))?; tokenizer.with_truncation(Some(truncation))?; tokenizer.with_padding(Some(padding)); - let model = tch::CModule::load(folder.as_ref().join("model.pt"))?; + let model = tch::CModule::load(folder.join("model.pt"))?; Ok(Self { model, tokenizer }) } @@ -442,7 +435,7 @@ pub struct AbstractiveModel { } impl AbstractiveModel { - pub fn open>(folder: P) -> Result { + pub fn open(folder: &Path) -> Result { let truncation = TruncationParams { max_length: TRUNCATE_INPUT_ABSTRACTIVE, ..Default::default() @@ -452,15 +445,14 @@ impl AbstractiveModel { ..Default::default() }; - let mut tokenizer = - tokenizers::Tokenizer::from_file(folder.as_ref().join("tokenizer.json"))?; + let mut tokenizer = tokenizers::Tokenizer::from_file(&folder.join("tokenizer.json"))?; tokenizer.with_truncation(Some(truncation))?; tokenizer.with_padding(Some(padding)); - let encoder = tch::CModule::load(folder.as_ref().join("traced_encoder.pt"))?; - let decoder = tch::CModule::load(folder.as_ref().join("traced_decoder.pt"))?; - let decoder_with_past = tch::CModule::load(folder.as_ref().join("traced_decoder_wp.pt"))?; + let encoder = tch::CModule::load(folder.join("traced_encoder.pt"))?; + let decoder = tch::CModule::load(folder.join("traced_decoder.pt"))?; + let decoder_with_past = tch::CModule::load(folder.join("traced_decoder_wp.pt"))?; Ok(Self { tokenizer, @@ -850,7 +842,7 @@ mod tests { fn abstractive_summary() { let summarizer = AbstractiveSummarizer { model: Arc::new( - AbstractiveModel::open("../data/summarizer/abstractive") + AbstractiveModel::open("../../data/summarizer/abstractive".as_ref()) .expect("abstractive summary model not found"), ), }; diff --git a/core/src/spell/word2vec.rs b/crates/llm/src/word2vec.rs similarity index 98% rename from core/src/spell/word2vec.rs rename to crates/llm/src/word2vec.rs index 204047385..1976b2ace 100644 --- a/core/src/spell/word2vec.rs +++ 
b/crates/llm/src/word2vec.rs @@ -138,7 +138,7 @@ pub struct Word2Vec { } impl Word2Vec { - pub fn open>(path: P) -> Result { + pub fn open(path: &Path) -> Result { let reader = BufReader::new(MultiGzDecoder::new(BufReader::new(File::open(path)?))); let reader = WordVectorReader::new_from_reader(reader)?; let vectors: HashMap<_, _> = reader.map(|(word, vec)| (word, WordVec(vec))).collect(); diff --git a/crates/mapreduce/Cargo.toml b/crates/mapreduce/Cargo.toml new file mode 100644 index 000000000..851f82dc8 --- /dev/null +++ b/crates/mapreduce/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "mapreduce" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +async-trait = "0.1.74" +distributed.workspace = true +futures.workspace = true +itertools.workspace = true +serde.workspace = true +sonic.workspace = true +thiserror.workspace = true +tokio.workspace = true +tracing.workspace = true diff --git a/core/src/mapreduce/mod.rs b/crates/mapreduce/src/lib.rs similarity index 98% rename from core/src/mapreduce/mod.rs rename to crates/mapreduce/src/lib.rs index 9069c1edb..b0043c6fd 100644 --- a/core/src/mapreduce/mod.rs +++ b/crates/mapreduce/src/lib.rs @@ -27,8 +27,6 @@ use thiserror::Error; pub use worker::StatelessWorker; pub use worker::Worker; -use crate::distributed::sonic; - pub(crate) type Result = std::result::Result; #[derive(Error, Debug)] diff --git a/core/src/mapreduce/manager.rs b/crates/mapreduce/src/manager.rs similarity index 97% rename from core/src/mapreduce/manager.rs rename to crates/mapreduce/src/manager.rs index 2ebb33260..4595ca0cd 100644 --- a/core/src/mapreduce/manager.rs +++ b/crates/mapreduce/src/manager.rs @@ -1,7 +1,5 @@ -use super::{Error, MapReduceConnection, Result, Worker}; -use super::{Map, Reduce}; -use crate::distributed::retry_strategy::ExponentialBackoff; -use crate::mapreduce::Task; +use crate::{Error, Map, MapReduceConnection, Reduce, Result, Task, Worker}; +use distributed::retry_strategy::ExponentialBackoff; use futures::StreamExt; use itertools::Itertools; use serde::de::DeserializeOwned; diff --git a/core/src/mapreduce/worker.rs b/crates/mapreduce/src/worker.rs similarity index 96% rename from core/src/mapreduce/worker.rs rename to crates/mapreduce/src/worker.rs index 46fad548b..6827018f8 100644 --- a/core/src/mapreduce/worker.rs +++ b/crates/mapreduce/src/worker.rs @@ -16,9 +16,8 @@ use std::net::SocketAddr; -use crate::mapreduce::MapReduceServer; +use crate::{Map, MapReduceServer, Result, Task}; -use super::{Map, Result, Task}; use async_trait::async_trait; use serde::{de::DeserializeOwned, Serialize}; use tracing::{debug, info}; diff --git a/crates/naive_bayes/Cargo.toml b/crates/naive_bayes/Cargo.toml new file mode 100644 index 000000000..e662a418f --- /dev/null +++ b/crates/naive_bayes/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "naive_bayes" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +hashbrown.workspace = true +itertools.workspace = true +serde = { workspace = true, features = ["derive"] } +stdx.workspace = true diff --git a/core/src/naive_bayes.rs b/crates/naive_bayes/src/lib.rs similarity index 99% rename from core/src/naive_bayes.rs rename to crates/naive_bayes/src/lib.rs index 729019ba5..78c08b8cb 100644 --- a/core/src/naive_bayes.rs +++ b/crates/naive_bayes/src/lib.rs @@ -14,10 +14,10 @@ // You should have received a copy of the 
GNU Affero General Public License // along with this program. If not, see . -use crate::intmap::IntMap; use hashbrown::{HashMap, HashSet}; use itertools::Itertools; use std::hash::Hash; +use stdx::intmap::IntMap; /// Laplace smoothing factor const ALPHA: f32 = 1.0; diff --git a/optics/Cargo.toml b/crates/optics/Cargo.toml similarity index 82% rename from optics/Cargo.toml rename to crates/optics/Cargo.toml index 3ecf105a9..41ef82274 100644 --- a/optics/Cargo.toml +++ b/crates/optics/Cargo.toml @@ -7,10 +7,9 @@ edition = "2021" thiserror = { workspace = true } lalrpop-util = { workspace = true } once_cell = { workspace = true } -itertools = { workspace = true } logos = { workspace = true } serde = { workspace = true } utoipa = { workspace = true } [build-dependencies] -lalrpop = { workspace = true } \ No newline at end of file +lalrpop = { workspace = true } diff --git a/core/build.rs b/crates/optics/build.rs similarity index 100% rename from core/build.rs rename to crates/optics/build.rs diff --git a/optics/src/ast.rs b/crates/optics/src/ast.rs similarity index 100% rename from optics/src/ast.rs rename to crates/optics/src/ast.rs diff --git a/optics/src/lexer.rs b/crates/optics/src/lexer.rs similarity index 100% rename from optics/src/lexer.rs rename to crates/optics/src/lexer.rs diff --git a/optics/src/lib.rs b/crates/optics/src/lib.rs similarity index 100% rename from optics/src/lib.rs rename to crates/optics/src/lib.rs diff --git a/optics/src/parser.lalrpop b/crates/optics/src/parser.lalrpop similarity index 100% rename from optics/src/parser.lalrpop rename to crates/optics/src/parser.lalrpop diff --git a/optics/testcases/crlf.optic b/crates/optics/testcases/crlf.optic similarity index 100% rename from optics/testcases/crlf.optic rename to crates/optics/testcases/crlf.optic diff --git a/optics/testcases/samples b/crates/optics/testcases/samples similarity index 100% rename from optics/testcases/samples rename to crates/optics/testcases/samples diff --git a/crates/query/Cargo.toml b/crates/query/Cargo.toml new file mode 100644 index 000000000..44eb86f1f --- /dev/null +++ b/crates/query/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "stract-query" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +itertools.workspace = true +min-max-heap.workspace = true +optics.workspace = true +proptest.workspace = true +schema.workspace = true +serde.workspace = true +serde_json.workspace = true +stdx.workspace = true +tantivy.workspace = true +url.workspace = true +urlencoding.workspace = true +utoipa.workspace = true diff --git a/core/src/bangs.rs b/crates/query/src/bangs.rs similarity index 93% rename from core/src/bangs.rs rename to crates/query/src/bangs.rs index 7ac34e7dc..c574029c8 100644 --- a/core/src/bangs.rs +++ b/crates/query/src/bangs.rs @@ -26,7 +26,7 @@ use serde::{Deserialize, Serialize}; use url::Url; use utoipa::ToSchema; -use crate::query::parser::Term; +use crate::parser::Term; pub const BANG_PREFIX: char = '!'; @@ -34,25 +34,25 @@ pub const BANG_PREFIX: char = '!'; #[serde(rename_all = "camelCase")] pub struct Bang { #[serde(rename = "c")] - pub(crate) category: Option, + pub category: Option, #[serde(rename = "sc")] - pub(crate) sub_category: Option, + pub sub_category: Option, #[serde(rename = "d")] - pub(crate) domain: Option, + pub domain: Option, #[serde(rename = "r")] - pub(crate) ranking: Option, + pub ranking: Option, #[serde(rename = "s")] - pub(crate) site: Option, + 
pub site: Option, #[serde(rename = "t")] - pub(crate) tag: String, + pub tag: String, #[serde(rename = "u")] - pub(crate) url: String, + pub url: String, } /// Wrapper around `Url` that implements `ToSchema` for `Url`. @@ -92,7 +92,7 @@ pub struct Bangs { } impl Bangs { - pub fn from_path>(path: P) -> Self { + pub fn from_path(path: &Path) -> Self { let json = fs::read_to_string(path).unwrap(); Self::from_json(json.as_str()) @@ -154,7 +154,7 @@ impl Bangs { #[cfg(test)] mod tests { - use crate::query::parser::parse; + use crate::parser::parse; use super::*; diff --git a/core/src/ranking/bm25.rs b/crates/query/src/bm25.rs similarity index 100% rename from core/src/ranking/bm25.rs rename to crates/query/src/bm25.rs diff --git a/core/src/query/const_query.rs b/crates/query/src/const_query.rs similarity index 100% rename from core/src/query/const_query.rs rename to crates/query/src/const_query.rs diff --git a/core/src/query/intersection.rs b/crates/query/src/intersection.rs similarity index 100% rename from core/src/query/intersection.rs rename to crates/query/src/intersection.rs diff --git a/crates/query/src/lib.rs b/crates/query/src/lib.rs new file mode 100644 index 000000000..182544470 --- /dev/null +++ b/crates/query/src/lib.rs @@ -0,0 +1,35 @@ +// Stract is an open source web search engine. +// Copyright (C) 2023 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +// use crate::{ +// inverted_index::InvertedIndex, query::parser::TermCompound, ranking::SignalCoefficient, +// search_ctx::Ctx, searcher::SearchQuery, Result, +// }; +// use optics::{Optic, SiteRankings}; +// use schema::{Field, TextField}; +// use std::collections::HashMap; +// use tantivy::query::{BooleanQuery, Occur, QueryClone, TermQuery}; +// use webpage::{region::Region, safety_classifier}; + +pub mod bangs; +pub mod bm25; +mod const_query; +pub mod intersection; +pub mod optic; +pub mod parser; +mod pattern_query; +pub mod shortcircuit; +pub mod union; diff --git a/crates/query/src/optic.rs b/crates/query/src/optic.rs new file mode 100644 index 000000000..9343e9406 --- /dev/null +++ b/crates/query/src/optic.rs @@ -0,0 +1,231 @@ +// Stract is an open source web search engine. +// Copyright (C) 2023 Stract ApS +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
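For orientation, a minimal sketch of the relocated bangs API as exposed by the new stract-query crate; only `Bangs::from_path`, `Bangs::from_json` and `BANG_PREFIX` are taken from the hunks above, while the file name and the two wrapper functions are illustrative assumptions.

use std::path::Path;
use stract_query::bangs::{Bangs, BANG_PREFIX};

// Hypothetical helper: load the bang definitions from a JSON file on disk.
fn load_bangs() -> Bangs {
    // `from_path` now takes a plain `&Path` instead of a generic `impl AsRef<Path>`.
    Bangs::from_path(Path::new("bangs.json"))
}

fn is_bang_query(query: &str) -> bool {
    // Queries starting with the '!' prefix (e.g. "!w rust") are handled as bang queries.
    query.starts_with(BANG_PREFIX)
}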
+ +use itertools::Itertools; +use optics::{Action, MatchLocation, Matching, Optic, Rule}; +use schema::{fastfield_reader::FastFieldReader, TextField}; +use tantivy::{ + query::{BooleanQuery, Occur, QueryClone}, + schema::Schema, +}; + +use super::{const_query::ConstQuery, pattern_query::PatternQuery, union::UnionQuery}; + +pub trait AsTantivyQuery { + fn as_tantivy( + &self, + schema: &Schema, + fastfield_reader: &FastFieldReader, + ) -> Box; +} + +pub trait AsMultipleTantivyQuery { + fn as_multiple_tantivy( + &self, + schema: &Schema, + fastfield_reader: &FastFieldReader, + ) -> Vec<(Occur, Box)>; +} + +impl AsMultipleTantivyQuery for Optic { + fn as_multiple_tantivy( + &self, + schema: &Schema, + fastfields: &FastFieldReader, + ) -> Vec<(Occur, Box)> { + if self.discard_non_matching { + vec![( + Occur::Must, + UnionQuery::from( + self.rules + .iter() + .chain(self.site_rankings.rules().iter()) + .filter_map(|rule| rule.as_searchable_rule(schema, fastfields)) + .map(|(occur, rule)| { + BooleanQuery::from(vec![(occur, rule.query)]).box_clone() + }) + .collect_vec(), + ) + .box_clone(), + )] + } else { + self.rules + .iter() + .chain(self.site_rankings.rules().iter()) + .filter_map(|rule| rule.as_searchable_rule(schema, fastfields)) + .map(|(occur, rule)| (occur, rule.query)) + .collect() + } + } +} + +pub struct SearchableRule { + pub query: Box, + pub boost: f64, +} + +pub trait AsSearchableRule { + fn as_searchable_rule( + &self, + schema: &Schema, + fastfield_reader: &FastFieldReader, + ) -> Option<(Occur, SearchableRule)>; +} + +impl AsSearchableRule for Rule { + fn as_searchable_rule( + &self, + schema: &Schema, + fastfield_reader: &FastFieldReader, + ) -> Option<(Occur, SearchableRule)> { + let mut subqueries: Vec<_> = self + .matches + .iter() + .map(|matching| (Occur::Must, matching.as_tantivy(schema, fastfield_reader))) + .collect(); + + if subqueries.is_empty() { + return None; + } + + let subquery = if subqueries.len() == 1 { + subqueries.pop().unwrap().1 + } else { + Box::new(BooleanQuery::from(subqueries)) + }; + + match &self.action { + Action::Boost(boost) => Some(( + Occur::Should, + SearchableRule { + query: Box::new(ConstQuery::new(subquery, 1.0)), + boost: *boost as f64, + }, + )), + Action::Downrank(boost) => Some(( + Occur::Should, + SearchableRule { + query: Box::new(ConstQuery::new(subquery, 1.0)), + boost: *boost as f64 * -1.0, + }, + )), + Action::Discard => Some(( + Occur::MustNot, + SearchableRule { + query: subquery, + boost: 0.0, + }, + )), + } + } +} + +impl AsTantivyQuery for Matching { + fn as_tantivy( + &self, + schema: &Schema, + fastfield_reader: &FastFieldReader, + ) -> Box { + match &self.location { + MatchLocation::Site => ConstQuery::new( + PatternQuery::new( + self.pattern.clone(), + TextField::UrlForSiteOperator, + schema, + fastfield_reader.clone(), + ) + .box_clone(), + 1.0, + ) + .box_clone(), + MatchLocation::Url => Box::new(ConstQuery::new( + Box::new(PatternQuery::new( + self.pattern.clone(), + TextField::Url, + schema, + fastfield_reader.clone(), + )), + 1.0, + )), + MatchLocation::Domain => Box::new(ConstQuery::new( + Box::new(PatternQuery::new( + self.pattern.clone(), + TextField::Domain, + schema, + fastfield_reader.clone(), + )), + 1.0, + )), + MatchLocation::Title => Box::new(ConstQuery::new( + Box::new(PatternQuery::new( + self.pattern.clone(), + TextField::Title, + schema, + fastfield_reader.clone(), + )), + 1.0, + )), + MatchLocation::Description => UnionQuery::from(vec![ + Box::new(ConstQuery::new( + Box::new(PatternQuery::new( + 
self.pattern.clone(), + TextField::Description, + schema, + fastfield_reader.clone(), + )), + 1.0, + )) as Box, + Box::new(ConstQuery::new( + Box::new(PatternQuery::new( + self.pattern.clone(), + TextField::DmozDescription, + schema, + fastfield_reader.clone(), + )), + 1.0, + )) as Box, + ]) + .box_clone(), + MatchLocation::Content => Box::new(ConstQuery::new( + Box::new(PatternQuery::new( + self.pattern.clone(), + TextField::CleanBody, + schema, + fastfield_reader.clone(), + )), + 1.0, + )), + MatchLocation::MicroformatTag => Box::new(ConstQuery::new( + Box::new(PatternQuery::new( + self.pattern.clone(), + TextField::MicroformatTags, + schema, + fastfield_reader.clone(), + )), + 1.0, + )), + MatchLocation::Schema => Box::new(ConstQuery::new( + Box::new(PatternQuery::new( + self.pattern.clone(), + TextField::FlattenedSchemaOrgJson, + schema, + fastfield_reader.clone(), + )), + 1.0, + )), + } + } +} diff --git a/core/src/query/parser.rs b/crates/query/src/parser.rs similarity index 98% rename from core/src/query/parser.rs rename to crates/query/src/parser.rs index c9a478f55..de1df9ce7 100644 --- a/core/src/query/parser.rs +++ b/crates/query/src/parser.rs @@ -14,16 +14,13 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . +use schema::{Field, TextField, ALL_FIELDS}; use tantivy::{ query::{BooleanQuery, Occur, PhraseQuery, TermQuery}, tokenizer::Tokenizer, }; -use crate::{ - bangs::BANG_PREFIX, - floor_char_boundary, - schema::{Field, TextField, ALL_FIELDS}, -}; +use crate::bangs::BANG_PREFIX; #[derive(Debug, Clone)] pub struct TermCompound { @@ -392,7 +389,7 @@ pub fn parse(query: &str) -> Vec> { continue; } - cur_term_begin = floor_char_boundary(&query, cur_term_begin); + cur_term_begin = stdx::floor_char_boundary(&query, cur_term_begin); if query[cur_term_begin..].starts_with('"') { if let Some(offset) = query[cur_term_begin + 1..].find('"') { @@ -418,7 +415,7 @@ pub fn parse(query: &str) -> Vec> { if cur_term_begin < query.len() { res.push(parse_term( - &query[floor_char_boundary(&query, cur_term_begin)..query.len()], + &query[stdx::floor_char_boundary(&query, cur_term_begin)..query.len()], )); } diff --git a/core/src/query/pattern_query.rs b/crates/query/src/pattern_query.rs similarity index 99% rename from core/src/query/pattern_query.rs rename to crates/query/src/pattern_query.rs index 751b3051e..0a2db8d69 100644 --- a/core/src/query/pattern_query.rs +++ b/crates/query/src/pattern_query.rs @@ -17,6 +17,10 @@ use std::sync::Arc; use optics::PatternPart; +use schema::{ + fastfield_reader::{self, FastFieldReader}, + FastField, Field, TextField, ALL_FIELDS, +}; use tantivy::{ fieldnorm::FieldNormReader, postings::SegmentPostings, @@ -26,12 +30,7 @@ use tantivy::{ DocId, DocSet, Postings, Score, SegmentReader, TantivyError, TERMINATED, }; -use crate::{ - fastfield_reader::{self, FastFieldReader}, - query::intersection::Intersection, - ranking::bm25::Bm25Weight, - schema::{FastField, Field, TextField, ALL_FIELDS}, -}; +use crate::{bm25::Bm25Weight, intersection::Intersection}; #[derive(Clone)] pub struct PatternQuery { diff --git a/core/src/query/shortcircuit.rs b/crates/query/src/shortcircuit.rs similarity index 100% rename from core/src/query/shortcircuit.rs rename to crates/query/src/shortcircuit.rs diff --git a/core/src/query/union.rs b/crates/query/src/union.rs similarity index 100% rename from core/src/query/union.rs rename to crates/query/src/union.rs diff --git a/crates/schema/Cargo.toml b/crates/schema/Cargo.toml 
new file mode 100644 index 000000000..d886a7fc9 --- /dev/null +++ b/crates/schema/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "schema" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +stdx.workspace = true +tantivy.workspace = true +tokenizer.workspace = true diff --git a/core/src/fastfield_reader.rs b/crates/schema/src/fastfield_reader.rs similarity index 97% rename from core/src/fastfield_reader.rs rename to crates/schema/src/fastfield_reader.rs index 0fa273e11..70e41d351 100644 --- a/core/src/fastfield_reader.rs +++ b/crates/schema/src/fastfield_reader.rs @@ -16,12 +16,10 @@ use std::{collections::HashMap, sync::Arc}; +use stdx::enum_map::EnumMap; use tantivy::{columnar::ColumnValues, DocId, SegmentId}; -use crate::{ - enum_map::EnumMap, - schema::{DataType, FastField, ALL_FIELDS}, -}; +use crate::{DataType, FastField, ALL_FIELDS}; #[derive(Default, Clone)] struct InnerFastFieldReader { diff --git a/core/src/schema.rs b/crates/schema/src/lib.rs similarity index 99% rename from core/src/schema.rs rename to crates/schema/src/lib.rs index 7aefebdd1..c7d0ec740 100644 --- a/core/src/schema.rs +++ b/crates/schema/src/lib.rs @@ -14,9 +14,10 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use tantivy::schema::{IndexRecordOption, NumericOptions, TextFieldIndexing, TextOptions}; +pub mod fastfield_reader; -use crate::tokenizer::{ +use tantivy::schema::{IndexRecordOption, NumericOptions, TextFieldIndexing, TextOptions}; +use tokenizer::{ BigramTokenizer, Identity, JsonField, SiteOperatorUrlTokenizer, Tokenizer, TrigramTokenizer, }; diff --git a/crates/simhash/Cargo.toml b/crates/simhash/Cargo.toml new file mode 100644 index 000000000..a6efdfcaf --- /dev/null +++ b/crates/simhash/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "simhash" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +tantivy.workspace = true +tokenizer.workspace = true diff --git a/core/src/simhash.rs b/crates/simhash/src/lib.rs similarity index 99% rename from core/src/simhash.rs rename to crates/simhash/src/lib.rs index 08b275b42..11609e476 100644 --- a/core/src/simhash.rs +++ b/crates/simhash/src/lib.rs @@ -18,8 +18,7 @@ use std::{ collections::{hash_map::DefaultHasher, HashMap}, hash::{Hash, Hasher}, }; - -use crate::tokenizer::Tokenizer; +use tokenizer::Tokenizer; pub type HashType = u64; diff --git a/crates/spell/Cargo.toml b/crates/spell/Cargo.toml new file mode 100644 index 000000000..e42809b28 --- /dev/null +++ b/crates/spell/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "spell" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +fst.workspace = true +itertools.workspace = true +schema.workspace = true +serde.workspace = true +stdx.workspace = true +stract-query.workspace = true +tantivy.workspace = true +thiserror.workspace = true +tracing.workspace = true diff --git a/core/src/spell/dictionary.rs b/crates/spell/src/dictionary.rs similarity index 99% rename from core/src/spell/dictionary.rs rename to crates/spell/src/dictionary.rs index 5b913e033..15d4e07ea 100644 --- a/core/src/spell/dictionary.rs +++ b/crates/spell/src/dictionary.rs @@ -13,7 +13,7 @@ // // You should have received a copy of the GNU Affero General Public License // 
along with this program. If not, see . -use crate::spell::distance::LevenshteinDistance; +use crate::distance::LevenshteinDistance; use std::cmp::Reverse; use std::collections::{BTreeMap, BinaryHeap}; use std::hash::{Hash, Hasher}; diff --git a/core/src/spell/distance.rs b/crates/spell/src/distance.rs similarity index 100% rename from core/src/spell/distance.rs rename to crates/spell/src/distance.rs diff --git a/core/src/spell/mod.rs b/crates/spell/src/lib.rs similarity index 92% rename from core/src/spell/mod.rs rename to crates/spell/src/lib.rs index 6c8008432..594309bce 100644 --- a/core/src/spell/mod.rs +++ b/crates/spell/src/lib.rs @@ -17,20 +17,15 @@ pub mod dictionary; pub mod distance; pub mod spell_checker; pub mod splitter; -pub mod word2vec; use std::ops::Range; use itertools::intersperse; +use schema::TextField; use serde::{Deserialize, Serialize}; +use stract_query::parser::Term; use tracing::info; -use crate::index::Index; -use crate::query::parser::Term; -use crate::schema::TextField; -use crate::searcher::SearchQuery; -use crate::{floor_char_boundary, query}; - use self::dictionary::DictionaryBuilder; pub use self::dictionary::{Dictionary, DictionaryResult, EditStrategy, LogarithmicEdit}; pub use self::spell_checker::SpellChecker; @@ -91,7 +86,7 @@ pub fn sentence_ranges(text: &str) -> Vec> { .filter(|(_, c)| matches!(c, '.' | '\n' | '?' | '!')) { res.push(last_start..end + 1); - last_start = floor_char_boundary(text, end + 2); + last_start = stdx::floor_char_boundary(text, end + 2); } res @@ -103,8 +98,8 @@ pub struct Spell { } impl Spell { - pub fn for_index(index: &Index) -> Self { - let dict = Self::build_dict(index); + pub fn for_searcher(searcher: tantivy::Searcher) -> Self { + let dict = Self::build_dict(searcher); let spell_checker = SpellChecker::new(dict.clone(), LogarithmicEdit::new(3)); Self { @@ -112,9 +107,8 @@ impl Spell { spell_checker, } } - fn build_dict(index: &Index) -> Dictionary { + fn build_dict(searcher: tantivy::Searcher) -> Dictionary { info!("Building spell correction dictionary"); - let searcher = index.inverted_index.tv_searcher(); let schema = searcher.schema(); let mut dict = DictionaryBuilder::new(20_000); @@ -223,8 +217,8 @@ impl Spell { } } - pub fn correction(&self, query: &SearchQuery) -> Option { - let terms: Vec<_> = query::parser::parse(&query.query) + pub fn correction(&self, query: &str) -> Option { + let terms: Vec<_> = stract_query::parser::parse(query) .into_iter() .filter_map(|term| match *term { Term::Simple(s) => Some(String::from(s)), diff --git a/core/src/spell/segment.srx b/crates/spell/src/segment.srx similarity index 100% rename from core/src/spell/segment.srx rename to crates/spell/src/segment.srx diff --git a/core/src/spell/spell_checker.rs b/crates/spell/src/spell_checker.rs similarity index 97% rename from core/src/spell/spell_checker.rs rename to crates/spell/src/spell_checker.rs index 57c4233b2..063b62a79 100644 --- a/core/src/spell/spell_checker.rs +++ b/crates/spell/src/spell_checker.rs @@ -13,14 +13,14 @@ // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . 
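A rough usage sketch of the reworked spell API: the `suggest` helper and its arguments are illustrative, and only the `Spell::for_searcher` and `Spell::correction` signatures are taken from the hunk above.

use spell::Spell;

// `searcher` is assumed to come from a tantivy index opened elsewhere.
fn suggest(searcher: tantivy::Searcher, query: &str) -> bool {
    // The dictionary is now built directly from a tantivy `Searcher`
    // rather than from the crate-internal `Index` type...
    let spell = Spell::for_searcher(searcher);
    // ...and `correction` takes the raw query string instead of a `SearchQuery`.
    spell.correction(query).is_some()
}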
-use crate::spell::dictionary::EditStrategy; -use crate::spell::Dictionary; +use crate::{ + dictionary::{EditStrategy, TermId}, + distance::LevenshteinDistance, + Dictionary, +}; use std::cmp::Ordering; use std::collections::{BTreeMap, HashSet, VecDeque}; -use super::dictionary::TermId; -use super::distance::LevenshteinDistance; - pub struct SpellChecker { dict: Dictionary, edit_strategy: T, @@ -170,7 +170,7 @@ impl SpellChecker { #[cfg(test)] mod tests { use super::*; - use crate::spell::{ + use crate::{ dictionary::{self, MaxEdit}, LogarithmicEdit, }; diff --git a/core/src/spell/splitter.rs b/crates/spell/src/splitter.rs similarity index 97% rename from core/src/spell/splitter.rs rename to crates/spell/src/splitter.rs index 2d66409d3..682a1bb32 100644 --- a/core/src/spell/splitter.rs +++ b/crates/spell/src/splitter.rs @@ -13,7 +13,7 @@ // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use crate::spell::Dictionary; +use crate::Dictionary; pub struct TermSplitter<'a> { dict: &'a Dictionary, @@ -68,7 +68,7 @@ impl<'a> TermSplitter<'a> { #[cfg(test)] mod tests { - use crate::spell::dictionary; + use crate::dictionary; use super::*; diff --git a/crates/stdx/Cargo.toml b/crates/stdx/Cargo.toml new file mode 100644 index 000000000..6c56b6a19 --- /dev/null +++ b/crates/stdx/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "stdx" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +md5.workspace = true +serde = { workspace = true, features = ["derive"] } diff --git a/core/src/directory.rs b/crates/stdx/src/directory.rs similarity index 90% rename from core/src/directory.rs rename to crates/stdx/src/directory.rs index 9d33ee333..91b48ab31 100644 --- a/core/src/directory.rs +++ b/crates/stdx/src/directory.rs @@ -14,7 +14,6 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use crate::Result; use serde::{Deserialize, Serialize}; use std::fs; @@ -30,7 +29,7 @@ pub enum DirEntry { }, } -fn iterate_children(path: &str) -> Result> { +fn iterate_children(path: &str) -> Result, std::io::Error> { let mut res = Vec::new(); for f in fs::read_dir(path)? 
{ @@ -53,7 +52,7 @@ fn iterate_children(path: &str) -> Result> { Ok(res) } -pub fn recreate_folder(entry: &DirEntry) -> Result<()> { +pub fn recreate_folder(entry: &DirEntry) -> Result<(), std::io::Error> { match entry { DirEntry::Folder { name, entries } => { fs::create_dir(name)?; @@ -68,7 +67,7 @@ pub fn recreate_folder(entry: &DirEntry) -> Result<()> { } } -pub fn scan_folder(path: String) -> Result { +pub fn scan_folder(path: String) -> Result { Ok(DirEntry::Folder { entries: iterate_children(&path)?, name: path, diff --git a/core/src/enum_map.rs b/crates/stdx/src/enum_map.rs similarity index 100% rename from core/src/enum_map.rs rename to crates/stdx/src/enum_map.rs diff --git a/core/src/intmap.rs b/crates/stdx/src/intmap.rs similarity index 100% rename from core/src/intmap.rs rename to crates/stdx/src/intmap.rs diff --git a/core/src/leaky_queue.rs b/crates/stdx/src/leaky_queue.rs similarity index 100% rename from core/src/leaky_queue.rs rename to crates/stdx/src/leaky_queue.rs diff --git a/crates/stdx/src/lib.rs b/crates/stdx/src/lib.rs new file mode 100644 index 000000000..31bae86c9 --- /dev/null +++ b/crates/stdx/src/lib.rs @@ -0,0 +1,74 @@ +pub mod directory; +pub mod enum_map; +pub mod intmap; +pub mod leaky_queue; +pub mod prehashed; + +use std::path::PathBuf; + +// taken from https://docs.rs/sled/0.34.7/src/sled/config.rs.html#445 +pub fn gen_temp_path() -> PathBuf { + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::time::SystemTime; + + static SALT_COUNTER: AtomicUsize = AtomicUsize::new(0); + + let seed = SALT_COUNTER.fetch_add(1, Ordering::SeqCst) as u128; + + let now = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_nanos() + << 48; + + let pid = u128::from(std::process::id()); + + let salt = (pid << 16) + now + seed; + + if cfg!(target_os = "linux") { + // use shared memory for temporary linux files + format!("/dev/shm/pagecache.tmp.{salt}").into() + } else { + std::env::temp_dir().join(format!("pagecache.tmp.{salt}")) + } +} + +pub fn ceil_char_boundary(str: &str, index: usize) -> usize { + let mut res = index; + + while !str.is_char_boundary(res) && res < str.len() { + res += 1; + } + + res +} + +pub fn floor_char_boundary(str: &str, index: usize) -> usize { + let mut res = index; + + while !str.is_char_boundary(res) && res > 0 { + res -= 1; + } + + res +} + +pub fn split_u128(num: u128) -> [u64; 2] { + [(num >> 64) as u64, num as u64] +} + +pub fn combine_u64s(nums: [u64; 2]) -> u128 { + ((nums[0] as u128) << 64) | (nums[1] as u128) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn split_combine_u128() { + for num in 0..10000_u128 { + assert_eq!(combine_u64s(split_u128(num)), num); + } + } +} diff --git a/core/src/prehashed.rs b/crates/stdx/src/prehashed.rs similarity index 100% rename from core/src/prehashed.rs rename to crates/stdx/src/prehashed.rs diff --git a/crates/warc/Cargo.toml b/crates/warc/Cargo.toml new file mode 100644 index 000000000..91cbd894a --- /dev/null +++ b/crates/warc/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "warc" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +chrono.workspace = true +encoding_rs = "0.8.33" +flate2.workspace = true +thiserror.workspace = true + +[dev-dependencies] +proptest.workspace = true +proptest-derive.workspace = true diff --git a/core/src/warc.rs b/crates/warc/src/lib.rs similarity index 77% rename from core/src/warc.rs rename to crates/warc/src/lib.rs 
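Since the new stdx crate is shown in full above, a short sketch of its helpers may be useful; the example string and id below are arbitrary, but the function names and signatures are exactly those in crates/stdx/src/lib.rs.

use stdx::{ceil_char_boundary, combine_u64s, floor_char_boundary, gen_temp_path, split_u128};

fn main() {
    // Clamp a byte index down/up to the nearest UTF-8 character boundary.
    let s = "héllo";
    assert_eq!(floor_char_boundary(s, 2), 1); // index 2 falls inside the two-byte 'é'
    assert_eq!(ceil_char_boundary(s, 2), 3);

    // A u128 id split into two u64 halves round-trips through `combine_u64s`.
    let id: u128 = 0x0123_4567_89ab_cdef_0011_2233_4455_6677;
    assert_eq!(combine_u64s(split_u128(id)), id);

    // Unique scratch path; on Linux it is placed in /dev/shm (shared memory).
    println!("{}", gen_temp_path().display());
}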
index 200429a03..ed56300a8 100644 --- a/core/src/warc.rs +++ b/crates/warc/src/lib.rs @@ -14,15 +14,12 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use crate::distributed::retry_strategy::ExponentialBackoff; -use crate::{config::S3Config, config::WarcSource, Error, Result}; +// use crate::{Error, Result}; use std::collections::BTreeMap; use std::fs::File; -use std::io::{BufRead, BufReader, Cursor, Read, Seek, Write}; +use std::io::{BufRead, BufReader, Read, Write}; use std::path::Path; use std::str::FromStr; -use std::thread::sleep; -use std::time::Duration; use flate2::read::MultiGzDecoder; use flate2::write::GzEncoder; @@ -30,7 +27,15 @@ use flate2::Compression; #[cfg(test)] use proptest_derive::Arbitrary; -use tracing::{debug, trace}; +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("IO error")] + Io(#[from] std::io::Error), + #[error("Failed to parse WARC file")] + Parse(&'static str), +} + +pub type Result = std::result::Result; pub struct WarcFile { bytes: Vec, @@ -68,7 +73,7 @@ impl WarcFile { Self { bytes } } - pub fn open>(path: P) -> Result { + pub fn open(path: &Path) -> Result { let file = File::open(path)?; let mut reader = BufReader::new(file); @@ -84,122 +89,6 @@ impl WarcFile { num_reads: 0, } } - - pub(crate) fn download(source: &WarcSource, warc_path: &str) -> Result { - let mut cursor = Cursor::new(Vec::new()); - Self::download_into_buf(source, warc_path, &mut cursor)?; - cursor.rewind()?; - - let mut buf = Vec::new(); - cursor.read_to_end(&mut buf)?; - - Ok(Self::new(buf)) - } - - pub(crate) fn download_into_buf( - source: &WarcSource, - warc_path: &str, - buf: &mut W, - ) -> Result<()> { - for dur in ExponentialBackoff::from_millis(10) - .with_limit(Duration::from_secs(30)) - .take(35) - { - let res = match source.clone() { - WarcSource::HTTP(config) => { - WarcFile::download_from_http(warc_path, config.base_url, buf) - } - WarcSource::Local(config) => { - WarcFile::load_from_folder(warc_path, &config.folder, buf) - } - WarcSource::S3(config) => WarcFile::download_from_s3(warc_path, &config, buf), - }; - - if res.is_ok() { - return Ok(()); - } else { - trace!("Error {:?}", res); - } - - debug!("warc download failed: {:?}", res.err().unwrap()); - debug!("retrying in {} ms", dur.as_millis()); - - sleep(dur); - } - - Err(Error::DownloadFailed.into()) - } - - fn load_from_folder(name: &str, folder: &str, buf: &mut W) -> Result<()> { - let f = File::open(Path::new(folder).join(name))?; - let mut reader = BufReader::new(f); - - buf.rewind()?; - - std::io::copy(&mut reader, buf)?; - - Ok(()) - } - - fn download_from_http( - warc_path: &str, - base_url: String, - buf: &mut W, - ) -> Result<()> { - let mut url = base_url; - if !url.ends_with('/') { - url += "/"; - } - url += warc_path; - - let client = reqwest::blocking::ClientBuilder::new() - .tcp_keepalive(None) - .pool_idle_timeout(Duration::from_secs(30 * 60)) - .timeout(Duration::from_secs(30 * 60)) - .connect_timeout(Duration::from_secs(30 * 60)) - .build()?; - let res = client.get(url).send()?; - - if res.status().as_u16() != 200 { - return Err(Error::DownloadFailed.into()); - } - - let bytes = res.bytes()?; - - buf.rewind()?; - std::io::copy(&mut &bytes[..], buf)?; - - Ok(()) - } - - fn download_from_s3( - warc_path: &str, - config: &S3Config, - buf: &mut W, - ) -> Result<()> { - let bucket = s3::Bucket::new( - &config.bucket, - s3::Region::Custom { - region: "".to_string(), - endpoint: config.endpoint.clone(), - }, - 
s3::creds::Credentials { - access_key: Some(config.access_key.clone()), - secret_key: Some(config.secret_key.clone()), - security_token: None, - session_token: None, - expiration: None, - }, - )? - .with_path_style() - .with_request_timeout(Duration::from_secs(30 * 60)); - - let res = bucket.get_object_blocking(warc_path)?; - - buf.write_all(res.bytes())?; - - Ok(()) - } } #[derive(Debug)] @@ -229,7 +118,7 @@ impl Request { url: record .header .get("WARC-TARGET-URI") - .ok_or(Error::WarcParse("No target url"))? + .ok_or(Error::Parse("No target url"))? .to_owned(), }) } @@ -254,7 +143,7 @@ impl FromStr for PayloadType { "application/pdf" => Ok(Self::Pdf), "application/rss" => Ok(Self::Rss), "application/atom" => Ok(Self::Atom), - _ => Err(Error::WarcParse("Unknown payload type")), + _ => Err(Error::Parse("Unknown payload type")), } } } @@ -283,7 +172,7 @@ impl Response { let (_header, content) = content .split_once("\r\n\r\n") - .ok_or(Error::WarcParse("Invalid http body"))?; + .ok_or(Error::Parse("Invalid http body"))?; Ok(Self { body: content.to_string(), @@ -313,13 +202,15 @@ impl Metadata { line.pop(); // remove colon let key = line; if key == "fetchTimeMs" { - let fetch_time_ms = value.parse::()?; + let fetch_time_ms = value + .parse::() + .map_err(|_| Error::Parse("failed to parse 'fetchTimeMs'"))?; return Ok(Self { fetch_time_ms }); } } } - Err(Error::WarcParse("Failed to parse metadata").into()) + Err(Error::Parse("Failed to parse metadata")) } } @@ -343,7 +234,7 @@ impl RecordIterator { rtrim(&mut version); if !version.to_uppercase().starts_with("WARC/1.") { - return Some(Err(Error::WarcParse("Unknown WARC version").into())); + return Some(Err(Error::Parse("Unknown WARC version"))); } let mut header = BTreeMap::::new(); @@ -376,23 +267,18 @@ impl RecordIterator { header.insert(key.to_ascii_uppercase(), value); } else { - return Some(Err(Error::WarcParse( - "All header lines must contain a colon", - ) - .into())); + return Some(Err(Error::Parse("All header lines must contain a colon"))); } } let content_len = header.get("CONTENT-LENGTH"); if content_len.is_none() { - return Some(Err(Error::WarcParse("Record has no content-length").into())); + return Some(Err(Error::Parse("Record has no content-length"))); } let content_len = content_len.unwrap().parse::(); if content_len.is_err() { - return Some(Err( - Error::WarcParse("Could not parse content length").into() - )); + return Some(Err(Error::Parse("Could not parse content length"))); } let content_len = content_len.unwrap(); @@ -407,7 +293,7 @@ impl RecordIterator { } if linefeed != [13, 10, 13, 10] { - return Some(Err(Error::WarcParse("Invalid record ending").into())); + return Some(Err(Error::Parse("Invalid record ending"))); } let record = RawWarcRecord { header, content }; @@ -456,17 +342,11 @@ impl Iterator for RecordIterator { match response.header.get("WARC-TYPE") { Some(warc_type) => { if warc_type.as_str() != "response" { - return Some(Err(Error::WarcParse( - "Expected response, got something else", - ) - .into())); + return Some(Err(Error::Parse("Expected response, got something else"))); } } None => { - return Some(Err(Error::WarcParse( - "Expected response, got something else", - ) - .into())); + return Some(Err(Error::Parse("Expected response, got something else"))); } } @@ -483,27 +363,18 @@ impl Iterator for RecordIterator { match metadata.header.get("WARC-TYPE") { Some(warc_type) => { if warc_type.as_str() != "metadata" { - return Some(Err(Error::WarcParse( - "Expected metadata, got something else", - ) - .into())); + 
return Some(Err(Error::Parse("Expected metadata, got something else"))); } } None => { - return Some(Err(Error::WarcParse( - "Expected metadata, got something else", - ) - .into())); + return Some(Err(Error::Parse("Expected metadata, got something else"))); } } let metadata = Metadata::from_raw(metadata); if request.is_err() || response.is_err() || metadata.is_err() { - return Some(Err(Error::WarcParse( - "Request, response or metadata is error", - ) - .into())); + return Some(Err(Error::Parse("Request, response or metadata is error"))); } let request = request.unwrap(); diff --git a/crates/webgraph/Cargo.toml b/crates/webgraph/Cargo.toml new file mode 100644 index 000000000..0cec5f240 --- /dev/null +++ b/crates/webgraph/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "webgraph" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow.workspace = true +bincode.workspace = true +bitvec.workspace = true +executor = { version = "0.1.0", path = "../executor" } +hyperloglog.workspace = true +indicatif.workspace = true +itertools.workspace = true +kv.workspace = true +lz4_flex.workspace = true +md5.workspace = true +memmap2.workspace = true +rayon.workspace = true +rocksdb.workspace = true +serde = { workspace = true, features = ["derive"] } +serde_json.workspace = true +stdx.workspace = true +tracing.workspace = true +url.workspace = true +utoipa.workspace = true +uuid.workspace = true + +[dev-dependencies] +maplit.workspace = true diff --git a/core/src/bloom.rs b/crates/webgraph/src/bloom.rs similarity index 100% rename from core/src/bloom.rs rename to crates/webgraph/src/bloom.rs diff --git a/core/src/webgraph/centrality/betweenness.rs b/crates/webgraph/src/centrality/betweenness.rs similarity index 95% rename from core/src/webgraph/centrality/betweenness.rs rename to crates/webgraph/src/centrality/betweenness.rs index 330bd5638..b2a30ce47 100644 --- a/core/src/webgraph/centrality/betweenness.rs +++ b/crates/webgraph/src/centrality/betweenness.rs @@ -21,11 +21,9 @@ use std::collections::{HashMap, VecDeque}; use indicatif::{ProgressBar, ProgressStyle}; use serde::{Deserialize, Serialize}; +use stdx::intmap::IntMap; -use crate::{ - intmap::IntMap, - webgraph::{Node, NodeID, Webgraph}, -}; +use crate::{Node, NodeID, Webgraph}; fn calculate(graph: &Webgraph, with_progress: bool) -> (HashMap, i32) { let mut centrality: HashMap = HashMap::new(); @@ -164,15 +162,15 @@ impl Betweenness { mod tests { use maplit::hashmap; - use crate::webgraph::WebgraphWriter; + use crate::WebgraphWriter; use super::*; fn create_path_graph(n: usize) -> Webgraph { let mut writer = WebgraphWriter::new( - crate::gen_temp_path(), - crate::executor::Executor::single_thread(), - crate::webgraph::Compression::default(), + &stdx::gen_temp_path(), + executor::Executor::single_thread(), + crate::Compression::default(), ); for i in 0..n - 1 { diff --git a/core/src/webgraph/centrality/derived_harmonic.rs b/crates/webgraph/src/centrality/derived_harmonic.rs similarity index 90% rename from core/src/webgraph/centrality/derived_harmonic.rs rename to crates/webgraph/src/centrality/derived_harmonic.rs index 74d8db5d1..b6c764f4e 100644 --- a/core/src/webgraph/centrality/derived_harmonic.rs +++ b/crates/webgraph/src/centrality/derived_harmonic.rs @@ -19,14 +19,11 @@ //! derive a centrality measure for the page graph. 
use anyhow::Result; +use kv::{rocksdb_store::RocksDbStore, Kv}; use rayon::prelude::*; use std::{collections::BTreeMap, path::Path, sync::Mutex}; -use crate::{ - bloom::BloomFilter, - kv::{rocksdb_store::RocksDbStore, Kv}, - webgraph::{NodeID, Webgraph}, -}; +use crate::{bloom::BloomFilter, NodeID, Webgraph}; struct BloomMap { map: Vec>, @@ -67,17 +64,17 @@ pub struct DerivedCentrality { } impl DerivedCentrality { - pub fn open>(path: P) -> Self { + pub fn open(path: &Path) -> Self { let inner = RocksDbStore::open(path); Self { inner } } - pub fn build>( + pub fn build( host_harmonic: &RocksDbStore, page_graph: &Webgraph, - output: P, + output: &Path, ) -> Result { - if output.as_ref().exists() { + if output.exists() { return Err(anyhow::anyhow!("output path already exists")); } @@ -91,7 +88,7 @@ impl DerivedCentrality { let has_outgoing = has_outgoing.finalize(); - let non_normalized = RocksDbStore::open(output.as_ref().join("non_normalized")); + let non_normalized = RocksDbStore::open(&output.join("non_normalized")); let norms: Mutex> = Mutex::new(BTreeMap::new()); @@ -126,7 +123,7 @@ impl DerivedCentrality { let norms = norms.into_inner().unwrap(); - let db = RocksDbStore::open(output.as_ref()); + let db = RocksDbStore::open(output); for (id, score) in non_normalized.iter() { let node = page_graph.id2node(&id).unwrap().into_host().id(); let norm = norms.get(&node).unwrap(); @@ -136,7 +133,7 @@ impl DerivedCentrality { db.flush(); drop(non_normalized); - std::fs::remove_dir_all(output.as_ref().join("non_normalized"))?; + std::fs::remove_dir_all(output.join("non_normalized"))?; Ok(Self { inner: db }) } diff --git a/core/src/webgraph/centrality/harmonic.rs b/crates/webgraph/src/centrality/harmonic.rs similarity index 95% rename from core/src/webgraph/centrality/harmonic.rs rename to crates/webgraph/src/centrality/harmonic.rs index 3b1d55f6f..a37c4ea5a 100644 --- a/core/src/webgraph/centrality/harmonic.rs +++ b/crates/webgraph/src/centrality/harmonic.rs @@ -22,13 +22,10 @@ use std::{ use std::sync::atomic::Ordering; use crate::bloom::BloomFilter; +use hyperloglog::HyperLogLog; use tracing::info; -use crate::{ - hyperloglog::HyperLogLog, - kahan_sum::KahanSum, - webgraph::{NodeID, Webgraph}, -}; +use crate::{kahan_sum::KahanSum, NodeID, Webgraph}; const HYPERLOGLOG_COUNTERS: usize = 64; @@ -192,7 +189,7 @@ impl HarmonicCentrality { #[cfg(test)] mod tests { use super::*; - use crate::webgraph::{Node, WebgraphWriter}; + use crate::{Node, WebgraphWriter}; fn test_edges() -> Vec<(Node, Node, String)> { // ┌────┐ @@ -216,9 +213,9 @@ mod tests { fn test_graph() -> Webgraph { let mut writer = WebgraphWriter::new( - crate::gen_temp_path(), - crate::executor::Executor::single_thread(), - crate::webgraph::Compression::default(), + &stdx::gen_temp_path(), + executor::Executor::single_thread(), + crate::Compression::default(), ); for (from, to, label) in test_edges() { @@ -231,9 +228,9 @@ mod tests { #[test] fn host_harmonic_centrality() { let mut writer = WebgraphWriter::new( - crate::gen_temp_path(), - crate::executor::Executor::single_thread(), - crate::webgraph::Compression::default(), + &stdx::gen_temp_path(), + executor::Executor::single_thread(), + crate::Compression::default(), ); writer.insert( @@ -339,9 +336,9 @@ mod tests { let centrality = HarmonicCentrality::calculate(&graph); let mut other = WebgraphWriter::new( - crate::gen_temp_path(), - crate::executor::Executor::single_thread(), - crate::webgraph::Compression::default(), + &stdx::gen_temp_path(), + executor::Executor::single_thread(), 
+ crate::Compression::default(), ); for (from, to, label) in test_edges() { diff --git a/core/src/webgraph/centrality/mod.rs b/crates/webgraph/src/centrality/mod.rs similarity index 97% rename from core/src/webgraph/centrality/mod.rs rename to crates/webgraph/src/centrality/mod.rs index 68eab2d09..da8c48467 100644 --- a/core/src/webgraph/centrality/mod.rs +++ b/crates/webgraph/src/centrality/mod.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use crate::kv::{rocksdb_store::RocksDbStore, Kv}; +use kv::{rocksdb_store::RocksDbStore, Kv}; use super::NodeID; diff --git a/core/src/kahan_sum.rs b/crates/webgraph/src/kahan_sum.rs similarity index 100% rename from core/src/kahan_sum.rs rename to crates/webgraph/src/kahan_sum.rs diff --git a/core/src/webgraph/mod.rs b/crates/webgraph/src/lib.rs similarity index 93% rename from core/src/webgraph/mod.rs rename to crates/webgraph/src/lib.rs index 3be628abd..27436c541 100644 --- a/core/src/webgraph/mod.rs +++ b/crates/webgraph/src/lib.rs @@ -13,6 +13,7 @@ // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . +mod bloom; mod segment; use std::collections::{BTreeMap, BinaryHeap}; @@ -22,14 +23,14 @@ use std::path::Path; use std::sync::Arc; use std::{cmp, fs}; +use executor::Executor; use rayon::prelude::*; +use stdx::intmap; use url::Url; use utoipa::ToSchema; -use crate::executor::Executor; -use crate::intmap; - pub mod centrality; +pub mod kahan_sum; mod store; use self::segment::{Segment, SegmentWriter}; @@ -309,9 +310,9 @@ pub struct WebgraphBuilder { } impl WebgraphBuilder { - pub fn new>(path: P) -> Self { + pub fn new(path: &Path) -> Self { Self { - path: path.as_ref().into(), + path: path.into(), executor: Executor::multi_thread("webgraph").unwrap(), compression: Compression::default(), } @@ -328,7 +329,7 @@ impl WebgraphBuilder { } pub fn open(self) -> Webgraph { - Webgraph::open(self.path, self.executor, self.compression) + Webgraph::open(&self.path, self.executor, self.compression) } } @@ -416,7 +417,7 @@ struct Meta { } impl Meta { - fn open>(path: P) -> Self { + fn open(path: &Path) -> Self { let mut reader = BufReader::new( File::options() .create(true) @@ -430,7 +431,7 @@ impl Meta { serde_json::from_str(&buf).unwrap_or_default() } - fn save>(&self, path: P) { + fn save(&self, path: &Path) { let mut writer = BufWriter::new( File::options() .create(true) @@ -451,7 +452,7 @@ struct Id2NodeDb { } impl Id2NodeDb { - fn open>(path: P) -> Self { + fn open(path: &Path) -> Self { let mut opts = rocksdb::Options::default(); opts.create_if_missing(true); opts.optimize_for_point_lookup(512); @@ -537,32 +538,32 @@ pub struct WebgraphWriter { } impl WebgraphWriter { - fn meta>(path: P) -> Meta { - let meta_path = path.as_ref().join("metadata.json"); - Meta::open(meta_path) + fn meta(path: &Path) -> Meta { + let meta_path = path.join("metadata.json"); + Meta::open(&meta_path) } fn save_metadata(&mut self) { let path = Path::new(&self.path).join("metadata.json"); - self.meta.save(path); + self.meta.save(&path); } - pub fn new>(path: P, executor: Executor, compression: Compression) -> Self { - fs::create_dir_all(&path).unwrap(); - let mut meta = Self::meta(&path); + pub fn new(path: &Path, executor: Executor, compression: Compression) -> Self { + fs::create_dir_all(path).unwrap(); + let mut meta = Self::meta(path); meta.comitted_segments.clear(); - 
fs::create_dir_all(path.as_ref().join("segments")).unwrap(); + fs::create_dir_all(path.join("segments")).unwrap(); let id = uuid::Uuid::new_v4().to_string(); - let segment = SegmentWriter::open(path.as_ref().join("segments"), id.clone(), compression); + let segment = SegmentWriter::open(&path.join("segments"), id.clone(), compression); meta.comitted_segments.push(id); Self { - path: path.as_ref().as_os_str().to_str().unwrap().to_string(), + path: path.as_os_str().to_str().unwrap().to_string(), segment, - id2node: Id2NodeDb::open(path.as_ref().join("id2node")), + id2node: Id2NodeDb::open(&path.join("id2node")), insert_batch: Vec::with_capacity(store::MAX_BATCH_SIZE), executor, meta, @@ -638,36 +639,36 @@ pub struct Webgraph { } impl Webgraph { - fn meta>(path: P) -> Meta { - let meta_path = path.as_ref().join("metadata.json"); - Meta::open(meta_path) + fn meta(path: &Path) -> Meta { + let meta_path = path.join("metadata.json"); + Meta::open(&meta_path) } fn save_metadata(&mut self) { let path = Path::new(&self.path).join("metadata.json"); - self.meta.save(path); + self.meta.save(&path); } - fn open>(path: P, executor: Executor, compression: Compression) -> Self { - fs::create_dir_all(&path).unwrap(); - let meta = Self::meta(&path); + fn open(path: &Path, executor: Executor, compression: Compression) -> Self { + fs::create_dir_all(path).unwrap(); + let meta = Self::meta(path); - fs::create_dir_all(path.as_ref().join("segments")).unwrap(); + fs::create_dir_all(path.join("segments")).unwrap(); let mut segments = Vec::new(); for segment in &meta.comitted_segments { segments.push(Segment::open( - path.as_ref().join("segments"), + &path.join("segments"), segment.clone(), compression, )); } Self { - path: path.as_ref().as_os_str().to_str().unwrap().to_string(), + path: path.as_os_str().to_str().unwrap().to_string(), segments, executor: Arc::new(executor), - id2node: Id2NodeDb::open(path.as_ref().join("id2node")), + id2node: Id2NodeDb::open(&path.join("id2node")), meta, compression, } @@ -684,7 +685,7 @@ impl Webgraph { self.meta.comitted_segments.push(segment.id()); drop(segment); self.segments - .push(Segment::open(new_path, id, self.compression)); + .push(Segment::open(&new_path, id, self.compression)); } self.save_metadata(); @@ -837,7 +838,7 @@ mod test { // D let mut graph = WebgraphWriter::new( - crate::gen_temp_path(), + &stdx::gen_temp_path(), Executor::single_thread(), Compression::default(), ); @@ -899,7 +900,7 @@ mod test { (Node::from("G"), Node::from("H"), String::new()), ] { let mut wrt = WebgraphWriter::new( - crate::gen_temp_path(), + &stdx::gen_temp_path(), Executor::single_thread(), Compression::default(), ); @@ -928,7 +929,7 @@ mod test { (Node::from("C"), Node::from("A"), String::new()), ] { let mut wrt = WebgraphWriter::new( - crate::gen_temp_path(), + &stdx::gen_temp_path(), Executor::single_thread(), Compression::default(), ); @@ -970,7 +971,7 @@ mod test { #[test] fn cap_label_length() { let mut writer = WebgraphWriter::new( - crate::gen_temp_path(), + &stdx::gen_temp_path(), Executor::single_thread(), Compression::default(), ); @@ -993,7 +994,7 @@ mod test { #[test] fn edges_by_host() { let mut writer = WebgraphWriter::new( - crate::gen_temp_path(), + &stdx::gen_temp_path(), Executor::single_thread(), Compression::default(), ); diff --git a/core/src/webgraph/segment.rs b/crates/webgraph/src/segment.rs similarity index 86% rename from core/src/webgraph/segment.rs rename to crates/webgraph/src/segment.rs index 715d8671d..13c256204 100644 --- a/core/src/webgraph/segment.rs 
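To show the `impl AsRef<Path>` to `&Path` change from the caller's side, a small sketch; the on-disk path is a placeholder, the `insert(from, to, label)` shape mirrors the test code above, and committing the writer into a `Webgraph` is left out because that part is unchanged here.

use std::path::Path;
use webgraph::{Compression, Node, WebgraphBuilder, WebgraphWriter};

fn main() {
    // Writers now take `&Path` directly; the tests above pass `&stdx::gen_temp_path()`.
    let mut writer = WebgraphWriter::new(
        &stdx::gen_temp_path(),
        executor::Executor::single_thread(),
        Compression::default(),
    );
    writer.insert(Node::from("a.com"), Node::from("b.com"), String::new());

    // Re-opening an existing graph goes through the builder.
    let _graph = WebgraphBuilder::new(Path::new("/tmp/webgraph")).open();
}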
+++ b/crates/webgraph/src/segment.rs @@ -32,27 +32,19 @@ pub struct SegmentWriter { } impl SegmentWriter { - pub fn open>(folder_path: P, id: String, compression: Compression) -> Self { + pub fn open(folder_path: &Path, id: String, compression: Compression) -> Self { SegmentWriter { full_adjacency: EdgeStoreWriter::open( - folder_path.as_ref().join(&id).join(ADJACENCY_STORE), + &folder_path.join(&id).join(ADJACENCY_STORE), compression, false, ), full_reversed_adjacency: EdgeStoreWriter::open( - folder_path - .as_ref() - .join(&id) - .join(REVERSED_ADJACENCY_STORE), + &folder_path.join(&id).join(REVERSED_ADJACENCY_STORE), compression, true, ), - folder_path: folder_path - .as_ref() - .as_os_str() - .to_str() - .unwrap() - .to_string(), + folder_path: folder_path.as_os_str().to_str().unwrap().to_string(), id, } } @@ -87,27 +79,19 @@ pub struct Segment { } impl Segment { - pub fn open>(folder_path: P, id: String, compression: Compression) -> Self { + pub fn open(folder_path: &Path, id: String, compression: Compression) -> Self { Segment { full_adjacency: EdgeStore::open( - folder_path.as_ref().join(&id).join(ADJACENCY_STORE), + &folder_path.join(&id).join(ADJACENCY_STORE), false, compression, ), full_reversed_adjacency: EdgeStore::open( - folder_path - .as_ref() - .join(&id) - .join(REVERSED_ADJACENCY_STORE), + &folder_path.join(&id).join(REVERSED_ADJACENCY_STORE), true, compression, ), - folder_path: folder_path - .as_ref() - .as_os_str() - .to_str() - .unwrap() - .to_string(), + folder_path: folder_path.as_os_str().to_str().unwrap().to_string(), id, } } @@ -151,7 +135,7 @@ impl Segment { #[cfg(test)] mod test { - use crate::webgraph::FullNodeID; + use crate::FullNodeID; use super::*; @@ -165,7 +149,7 @@ mod test { // 1─────►2◄┘ let mut writer = SegmentWriter::open( - crate::gen_temp_path(), + &stdx::gen_temp_path(), "test".to_string(), Compression::default(), ); diff --git a/core/src/webgraph/store.rs b/crates/webgraph/src/store.rs similarity index 96% rename from core/src/webgraph/store.rs rename to crates/webgraph/src/store.rs index bd9711ca1..dfa79dc4c 100644 --- a/core/src/webgraph/store.rs +++ b/crates/webgraph/src/store.rs @@ -38,7 +38,7 @@ pub struct EdgeStoreWriter { } impl EdgeStoreWriter { - pub fn open>(path: P, compression: Compression, reversed: bool) -> Self { + pub fn open(path: &Path, compression: Compression, reversed: bool) -> Self { let mut options = rocksdb::Options::default(); options.create_if_missing(true); @@ -69,7 +69,7 @@ impl EdgeStoreWriter { options.set_block_based_table_factory(&block_options); options.set_compression_type(rocksdb::DBCompressionType::Lz4); - let db = rocksdb::DB::open(&options, path.as_ref().join("writer")).unwrap(); + let db = rocksdb::DB::open(&options, path.join("writer")).unwrap(); Self { db, @@ -163,7 +163,7 @@ struct PrefixDb { } impl PrefixDb { - fn open>(path: P) -> Self { + fn open(path: &Path) -> Self { let mut options = rocksdb::Options::default(); options.create_if_missing(true); @@ -261,7 +261,7 @@ pub struct EdgeStore { } impl EdgeStore { - pub fn open>(path: P, reversed: bool, compression: Compression) -> Self { + pub fn open(path: &Path, reversed: bool, compression: Compression) -> Self { let mut options = rocksdb::Options::default(); options.create_if_missing(true); @@ -295,12 +295,12 @@ impl EdgeStore { let ranges = match rocksdb::DB::open_cf_with_opts( &options, - path.as_ref().join("ranges"), + path.join("ranges"), [("nodes", options.clone()), ("labels", options.clone())], ) { Ok(db) => db, Err(_) => { - let mut ranges = 
rocksdb::DB::open(&options, path.as_ref().join("ranges")).unwrap(); + let mut ranges = rocksdb::DB::open(&options, path.join("ranges")).unwrap(); ranges.create_cf("nodes", &options).unwrap(); ranges.create_cf("labels", &options).unwrap(); @@ -313,7 +313,7 @@ impl EdgeStore { .read(true) .create(true) .write(true) - .open(path.as_ref().join("labels")) + .open(path.join("labels")) .unwrap(); let edge_labels = unsafe { Mmap::map(&edge_labels_file).unwrap() }; let edge_labels_len = edge_labels.len(); @@ -322,7 +322,7 @@ impl EdgeStore { .read(true) .create(true) .write(true) - .open(path.as_ref().join("nodes")) + .open(path.join("nodes")) .unwrap(); let edge_nodes = unsafe { Mmap::map(&edge_nodes_file).unwrap() }; let edge_nodes_len = edge_nodes.len(); @@ -330,7 +330,7 @@ impl EdgeStore { Self { reversed, ranges, - prefixes: PrefixDb::open(path.as_ref().join("prefixes")), + prefixes: PrefixDb::open(&path.join("prefixes")), _cache: cache, edge_labels, edge_labels_len, @@ -417,8 +417,8 @@ impl EdgeStore { /// Build a new edge store from a set of edges. The edges must be sorted by /// either the from or to node, depending on the value of `reversed`. - fn build>( - path: P, + fn build( + path: &Path, compression: Compression, reversed: bool, edges: impl Iterator>, @@ -594,14 +594,14 @@ impl EdgeStore { #[cfg(test)] mod tests { - use crate::webgraph::FullNodeID; + use crate::FullNodeID; use super::*; #[test] fn test_insert() { let kv: EdgeStoreWriter = EdgeStoreWriter::open( - crate::gen_temp_path().join("test-segment"), + &stdx::gen_temp_path().join("test-segment"), Compression::default(), false, ); @@ -635,7 +635,7 @@ mod tests { #[test] fn test_reversed() { let kv: EdgeStoreWriter = EdgeStoreWriter::open( - crate::gen_temp_path().join("test-segment"), + &stdx::gen_temp_path().join("test-segment"), Compression::default(), true, ); diff --git a/crates/webpage/Cargo.toml b/crates/webpage/Cargo.toml new file mode 100644 index 000000000..a2935cfc0 --- /dev/null +++ b/crates/webpage/Cargo.toml @@ -0,0 +1,35 @@ +[package] +name = "webpage" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow.workspace = true +bincode.workspace = true +chrono.workspace = true +csv.workspace = true +itertools.workspace = true +kuchiki.workspace = true +naive_bayes.workspace = true +once_cell.workspace = true +publicsuffix.workspace = true +regex.workspace = true +rust-stemmers = "1.2.0" +schema.workspace = true +serde.workspace = true +serde_json.workspace = true +simhash.workspace = true +stdx.workspace = true +tantivy.workspace = true +thiserror.workspace = true +tokenizer.workspace = true +tracing.workspace = true +url.workspace = true +utoipa.workspace = true +webgraph.workspace = true +whatlang.workspace = true + +[dev-dependencies] +maplit.workspace = true diff --git a/core/src/webpage/just_text.rs b/crates/webpage/src/just_text.rs similarity index 88% rename from core/src/webpage/just_text.rs rename to crates/webpage/src/just_text.rs index 0aebd97c0..158d94de6 100644 --- a/core/src/webpage/just_text.rs +++ b/crates/webpage/src/just_text.rs @@ -90,52 +90,52 @@ macro_rules! 
include_stopwords { static STOPWORDS: once_cell::sync::Lazy>> = once_cell::sync::Lazy::new(|| { include_stopwords!( - "../../stopwords/Afrikaans.txt" => Lang::Afr, - "../../stopwords/Arabic.txt" => Lang::Ara, - "../../stopwords/Armenian.txt" => Lang::Hye, - "../../stopwords/Azerbaijani.txt" => Lang::Aze, - "../../stopwords/Belarusian.txt" => Lang::Bel, - "../../stopwords/Bengali.txt" => Lang::Ben, - "../../stopwords/Bulgarian.txt" => Lang::Bul, - "../../stopwords/Catalan.txt" => Lang::Cat, - "../../stopwords/Croatian.txt" => Lang::Hrv, - "../../stopwords/Czech.txt" => Lang::Ces, - "../../stopwords/Danish.txt" => Lang::Dan, - "../../stopwords/Dutch.txt" => Lang::Nld, - "../../stopwords/English.txt" => Lang::Eng, - "../../stopwords/Esperanto.txt" => Lang::Epo, - "../../stopwords/Estonian.txt" => Lang::Est, - "../../stopwords/Finnish.txt" => Lang::Fin, - "../../stopwords/French.txt" => Lang::Fra, - "../../stopwords/Georgian.txt" => Lang::Kat, - "../../stopwords/German.txt" => Lang::Deu, - "../../stopwords/Greek.txt" => Lang::Ell, - "../../stopwords/Gujarati.txt" => Lang::Guj, - "../../stopwords/Hebrew.txt" => Lang::Heb, - "../../stopwords/Hindi.txt" => Lang::Hin, - "../../stopwords/Hungarian.txt" => Lang::Hun, - "../../stopwords/Indonesian.txt" => Lang::Ind, - "../../stopwords/Italian.txt" => Lang::Ita, - "../../stopwords/Javanese.txt" => Lang::Jav, - "../../stopwords/Kannada.txt" => Lang::Kan, - "../../stopwords/Korean.txt" => Lang::Kor, - "../../stopwords/Latin.txt" => Lang::Lat, - "../../stopwords/Latvian.txt" => Lang::Lav, - "../../stopwords/Lithuanian.txt" => Lang::Lit, - "../../stopwords/Macedonian.txt" => Lang::Mkd, - "../../stopwords/Malayalam.txt" => Lang::Mal, - "../../stopwords/Marathi.txt" => Lang::Mar, - "../../stopwords/Nepali.txt" => Lang::Nep, - "../../stopwords/Persian.txt" => Lang::Pes, - "../../stopwords/Polish.txt" => Lang::Pol, - "../../stopwords/Portuguese.txt" => Lang::Por, - "../../stopwords/Romanian.txt" => Lang::Ron, - "../../stopwords/Russian.txt" => Lang::Rus, - "../../stopwords/Serbian.txt" => Lang::Srp, - "../../stopwords/Slovak.txt" => Lang::Slk, - "../../stopwords/Slovenian.txt" => Lang::Slv, - "../../stopwords/Spanish.txt" => Lang::Spa, - "../../stopwords/Japanese.txt" => Lang::Jpn + "../../core/stopwords/Afrikaans.txt" => Lang::Afr, + "../../core/stopwords/Arabic.txt" => Lang::Ara, + "../../core/stopwords/Armenian.txt" => Lang::Hye, + "../../core/stopwords/Azerbaijani.txt" => Lang::Aze, + "../../core/stopwords/Belarusian.txt" => Lang::Bel, + "../../core/stopwords/Bengali.txt" => Lang::Ben, + "../../core/stopwords/Bulgarian.txt" => Lang::Bul, + "../../core/stopwords/Catalan.txt" => Lang::Cat, + "../../core/stopwords/Croatian.txt" => Lang::Hrv, + "../../core/stopwords/Czech.txt" => Lang::Ces, + "../../core/stopwords/Danish.txt" => Lang::Dan, + "../../core/stopwords/Dutch.txt" => Lang::Nld, + "../../core/stopwords/English.txt" => Lang::Eng, + "../../core/stopwords/Esperanto.txt" => Lang::Epo, + "../../core/stopwords/Estonian.txt" => Lang::Est, + "../../core/stopwords/Finnish.txt" => Lang::Fin, + "../../core/stopwords/French.txt" => Lang::Fra, + "../../core/stopwords/Georgian.txt" => Lang::Kat, + "../../core/stopwords/German.txt" => Lang::Deu, + "../../core/stopwords/Greek.txt" => Lang::Ell, + "../../core/stopwords/Gujarati.txt" => Lang::Guj, + "../../core/stopwords/Hebrew.txt" => Lang::Heb, + "../../core/stopwords/Hindi.txt" => Lang::Hin, + "../../core/stopwords/Hungarian.txt" => Lang::Hun, + "../../core/stopwords/Indonesian.txt" => Lang::Ind, + 
"../../core/stopwords/Italian.txt" => Lang::Ita, + "../../core/stopwords/Javanese.txt" => Lang::Jav, + "../../core/stopwords/Kannada.txt" => Lang::Kan, + "../../core/stopwords/Korean.txt" => Lang::Kor, + "../../core/stopwords/Latin.txt" => Lang::Lat, + "../../core/stopwords/Latvian.txt" => Lang::Lav, + "../../core/stopwords/Lithuanian.txt" => Lang::Lit, + "../../core/stopwords/Macedonian.txt" => Lang::Mkd, + "../../core/stopwords/Malayalam.txt" => Lang::Mal, + "../../core/stopwords/Marathi.txt" => Lang::Mar, + "../../core/stopwords/Nepali.txt" => Lang::Nep, + "../../core/stopwords/Persian.txt" => Lang::Pes, + "../../core/stopwords/Polish.txt" => Lang::Pol, + "../../core/stopwords/Portuguese.txt" => Lang::Por, + "../../core/stopwords/Romanian.txt" => Lang::Ron, + "../../core/stopwords/Russian.txt" => Lang::Rus, + "../../core/stopwords/Serbian.txt" => Lang::Srp, + "../../core/stopwords/Slovak.txt" => Lang::Slk, + "../../core/stopwords/Slovenian.txt" => Lang::Slv, + "../../core/stopwords/Spanish.txt" => Lang::Spa, + "../../core/stopwords/Japanese.txt" => Lang::Jpn ) }); diff --git a/core/src/webpage/mod.rs b/crates/webpage/src/lib.rs similarity index 96% rename from core/src/webpage/mod.rs rename to crates/webpage/src/lib.rs index 9487df6be..ee8f6138f 100644 --- a/core/src/webpage/mod.rs +++ b/crates/webpage/src/lib.rs @@ -13,25 +13,19 @@ // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use crate::{ - ceil_char_boundary, - enum_map::EnumSet, - prehashed::hash, - schema::{FastField, TextField}, - simhash, split_u128, tokenizer, - webgraph::NodeID, - Error, Result, -}; use chrono::{DateTime, FixedOffset, Utc}; use itertools::Itertools; use kuchiki::{iter::NodeEdge, traits::TendrilSink, NodeRef}; use regex::Regex; +use schema::{FastField, Field, TextField, ALL_FIELDS, FLOAT_SCALING}; use std::{collections::HashMap, panic, str::FromStr}; +use stdx::{enum_map::EnumSet, prehashed::hash}; use tantivy::{ tokenizer::{PreTokenizedString, Tokenizer}, TantivyDocument, }; use url::Url; +use webgraph::NodeID; use whatlang::Lang; mod just_text; @@ -40,14 +34,60 @@ pub mod safety_classifier; pub mod schema_org; pub mod url_ext; -use crate::schema::{Field, ALL_FIELDS, FLOAT_SCALING}; - use self::{ just_text::{JustText, Paragraph}, region::Region, url_ext::UrlExt, }; +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("Encountered an empty required field ({0}) when converting to tantivy")] + EmptyField(&'static str), + + // #[error("Parsing error")] + // ParsingError(String), + + // #[error("Failed to download warc files after all retries")] + // DownloadFailed, + + // #[error("Query cannot be completely empty")] + // EmptyQuery, + #[error("Unknown region")] + UnknownRegion, + + // #[error("Unknown CLI option")] + // UnknownCLIOption, + + // #[error("The stackoverflow schema was not structured as expected")] + // InvalidStackoverflowSchema, + + // #[error("Internal error")] + // InternalError(String), + #[error("IO error")] + Io(#[from] std::io::Error), + + #[error("JSON error")] + Json(#[from] serde_json::Error), + + #[error("URL parse error")] + Url(#[from] url::ParseError), + + #[error("CSV error")] + Csv(#[from] csv::Error), + + #[error("Bincode error")] + Bincode(#[from] bincode::Error), + + #[error("Unknown webpage robots meta tag")] + UnknownRobotsMetaTag, + + #[error("Unknown microformat")] + UnknownMicroformat, +} + +pub type Result = std::result::Result; + pub static URL_REGEX: once_cell::sync::Lazy = 
once_cell::sync::Lazy::new(|| { Regex::new(r"(((http|ftp|https):/{2})+(([0-9a-z_-]+\.)+(aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cx|cy|cz|cz|de|dj|dk|dm|do|dz|ec|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mn|mn|mo|mp|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|nom|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ra|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw|arpa)(:[0-9]+)?((/([~0-9a-zA-Z\#\+%@\./_-]+))?(\?[0-9a-zA-Z\+%@/&\[\];=_-]+)?)?))\b").unwrap() }); @@ -130,7 +170,8 @@ pub struct Webpage { } impl Webpage { - #[cfg(test)] + // TODO: I needed to make this for all targets due to #[cfg(test)] not being exported + // #[cfg(test)] pub fn new(html: &str, url: &str) -> Result { let html = Html::parse(html, url)?; @@ -238,7 +279,7 @@ impl Webpage { match &self.node_id { Some(node_id) => { - let [node_id1, node_id2] = split_u128(node_id.bit_128()); + let [node_id1, node_id2] = stdx::split_u128(node_id.bit_128()); doc.add_u64( schema .get_field(Field::Fast(FastField::HostNodeID1).name()) @@ -291,13 +332,13 @@ enum RobotsMeta { } impl FromStr for RobotsMeta { - type Err = anyhow::Error; + type Err = Error; fn from_str(s: &str) -> Result { match s { "noindex" => Ok(RobotsMeta::NoIndex), "nofollow" => Ok(RobotsMeta::NoFollow), - _ => Err(Error::UnknownRobotsMetaTag.into()), + _ => Err(Error::UnknownRobotsMetaTag), } } } @@ -348,7 +389,7 @@ impl From for usize { } impl TryFrom for Microformat { - type Error = anyhow::Error; + type Error = Error; fn try_from(value: usize) -> Result { match value { @@ -358,7 +399,7 @@ impl TryFrom for Microformat { 3 => Ok(Microformat::HRecipe), 4 => Ok(Microformat::HReview), 5 => Ok(Microformat::HProduct), - _ => Err(anyhow::anyhow!("Unknown microformat")), + _ => Err(Error::UnknownMicroformat), } } } @@ -391,7 +432,8 @@ impl Html { Ok(html) } - #[cfg(test)] + // TODO: I needed to make this for all targets due to #[cfg(test)] not being exported + // #[cfg(test)] pub fn set_clean_text(&mut self, text: String) { self.clean_text = Some(text); } @@ -849,7 +891,7 @@ impl Html { let title = self.title(); if title.is_none() { - return Err(Error::EmptyField("title").into()); + return Err(Error::EmptyField("title")); } let title = title.unwrap(); @@ -860,7 +902,7 @@ impl Html { let all_text = self.all_text(); if all_text.is_none() { - return Err(Error::EmptyField("all body").into()); + return Err(Error::EmptyField("all body")); } let all_text = all_text.unwrap(); @@ -955,7 +997,7 @@ impl Html { .unwrap_or_default() .find('.') .map(|index| { - &domain.text[..ceil_char_boundary(&domain.text, index).min(domain.text.len())] + &domain.text[..stdx::ceil_char_boundary(&domain.text, index).min(domain.text.len())] }) .unwrap_or_default() .to_string(); @@ -987,16 +1029,16 @@ impl Html { }, }; - let site_hash = split_u128(hash(self.url().host_str().unwrap_or_default()).0); + let site_hash = stdx::split_u128(hash(self.url().host_str().unwrap_or_default()).0); let mut url_without_query = self.url().clone(); 
url_without_query.set_query(None); - let url_without_query_hash = split_u128(hash(url_without_query.as_str()).0); - let url_hash = split_u128(hash(self.url().as_str()).0); + let url_without_query_hash = stdx::split_u128(hash(url_without_query.as_str()).0); + let url_hash = stdx::split_u128(hash(self.url().as_str()).0); - let domain_hash = split_u128(hash(self.url().root_domain().unwrap_or_default()).0); - let title_hash = split_u128(hash(self.title().unwrap_or_default()).0); + let domain_hash = stdx::split_u128(hash(self.url().root_domain().unwrap_or_default()).0); + let title_hash = stdx::split_u128(hash(self.title().unwrap_or_default()).0); for field in &ALL_FIELDS { let tantivy_field = schema @@ -1577,7 +1619,8 @@ pub type Meta = HashMap; mod tests { // TODO: make test macro to test both dom parsers - use crate::{schema::create_schema, webpage::url_ext::UrlExt}; + use crate::url_ext::UrlExt; + use schema::create_schema; use super::*; @@ -1758,7 +1801,7 @@ mod tests { #[test] fn hard_parsing() { let webpage = Html::parse( - include_str!("../../testcases/parsing/yasudaya.html"), + include_str!("../../core/testcases/parsing/yasudaya.html"), "https://example.com", ) .unwrap(); @@ -1770,7 +1813,7 @@ mod tests { assert!(!webpage.all_text().unwrap().is_empty()); let webpage = Html::parse( - include_str!("../../testcases/parsing/5390001.html"), + include_str!("../../core/testcases/parsing/5390001.html"), "https://example.com", ) .unwrap(); @@ -1782,7 +1825,7 @@ mod tests { assert!(!webpage.all_text().unwrap().is_empty()); let webpage = Html::parse( - include_str!("../../testcases/parsing/77p2p-7.live-105.html"), + include_str!("../../core/testcases/parsing/77p2p-7.live-105.html"), "https://example.com", ) .unwrap(); @@ -1797,7 +1840,7 @@ mod tests { #[test] fn reddit_comments() { let webpage = Html::parse( - include_str!("../../testcases/parsing/reddit.html"), + include_str!("../../core/testcases/parsing/reddit.html"), "https://reddit.com/", ) .unwrap(); @@ -1813,7 +1856,7 @@ mod tests { #[test] fn out_of_bounds_str() { let webpage = Html::parse( - include_str!("../../testcases/parsing/byte_index_out_of_bounds.html"), + include_str!("../../core/testcases/parsing/byte_index_out_of_bounds.html"), "https://example.com", ) .unwrap(); @@ -2275,7 +2318,8 @@ mod tests { #[test] fn stackoverflow_question_has_clean_text() { - let stackoverflow = include_str!("../../testcases/schema_org/stackoverflow_with_code.html"); + let stackoverflow = + include_str!("../../core/testcases/schema_org/stackoverflow_with_code.html"); let html = Html::parse(stackoverflow, "https://www.example.com").unwrap(); assert!(html.clean_text().is_some()); diff --git a/core/src/webpage/region.rs b/crates/webpage/src/region.rs similarity index 89% rename from core/src/webpage/region.rs rename to crates/webpage/src/region.rs index ef4c10fa2..998f6f374 100644 --- a/core/src/webpage/region.rs +++ b/crates/webpage/src/region.rs @@ -89,7 +89,7 @@ impl Region { "ger" => Ok(Region::Germany), "spa" => Ok(Region::Spain), "us" => Ok(Region::US), - _ => Err(Error::UnknownRegion.into()), + _ => Err(Error::UnknownRegion), } } @@ -101,9 +101,9 @@ impl Region { whatlang::Lang::Fra => Ok(Region::France), whatlang::Lang::Deu => Ok(Region::Germany), whatlang::Lang::Dan => Ok(Region::Denmark), - _ => Err(Error::UnknownRegion.into()), + _ => Err(Error::UnknownRegion), }, - None => Err(Error::UnknownRegion.into()), + None => Err(Error::UnknownRegion), } } @@ -132,16 +132,16 @@ pub struct RegionCount { } impl RegionCount { - pub fn open>(path: P) -> Self 
{ - let map: HashMap = if !path.as_ref().exists() { - if let Some(parent) = path.as_ref().parent() { + pub fn open(path: &Path) -> Self { + let map: HashMap = if !path.exists() { + if let Some(parent) = path.parent() { std::fs::create_dir_all(parent).unwrap(); } - File::create(path.as_ref()).unwrap(); + File::create(path).unwrap(); HashMap::new() } else { - let json = std::fs::read_to_string(path.as_ref()).unwrap_or_default(); + let json = std::fs::read_to_string(path).unwrap_or_default(); serde_json::from_str(&json).unwrap_or_else(|_| HashMap::new()) }; @@ -161,7 +161,7 @@ impl RegionCount { total_counts: map.values().sum(), map, fast_count, - path: path.as_ref().to_str().unwrap().to_string(), + path: path.to_str().unwrap().to_string(), } } @@ -215,19 +215,19 @@ impl RegionCount { #[cfg(test)] mod tests { - use crate::gen_temp_path; + use stdx::gen_temp_path; use super::*; #[test] fn simple() { - let mut a = RegionCount::open(gen_temp_path().join("region_count.json")); + let mut a = RegionCount::open(&gen_temp_path().join("region_count.json")); a.increment(&Region::Denmark); a.increment(&Region::Denmark); a.increment(&Region::US); - let mut b = RegionCount::open(gen_temp_path().join("region_count.json")); + let mut b = RegionCount::open(&gen_temp_path().join("region_count.json")); b.increment(&Region::US); b.increment(&Region::Germany); diff --git a/core/src/webpage/safety_classifier.rs b/crates/webpage/src/safety_classifier.rs similarity index 92% rename from core/src/webpage/safety_classifier.rs rename to crates/webpage/src/safety_classifier.rs index d4de713c8..98387f5b5 100644 --- a/core/src/webpage/safety_classifier.rs +++ b/crates/webpage/src/safety_classifier.rs @@ -19,7 +19,6 @@ use std::path::Path; use itertools::Itertools; -use crate::naive_bayes; use crate::Result; const MAX_NUM_WORDS: usize = 100; @@ -61,7 +60,7 @@ pub struct Datapoint { pub text: String, } -pub fn load_dataset>(path: P) -> Result> { +pub fn load_dataset(path: &Path) -> Result> { let mut datapoints = Vec::new(); let mut reader = csv::Reader::from_path(path)?; for result in reader.deserialize() { @@ -78,7 +77,7 @@ fn normalize(text: &str) -> String { .to_lowercase() } -pub fn page_text(page: &crate::webpage::Webpage) -> String { +pub fn page_text(page: &crate::Webpage) -> String { page.html.title().unwrap_or_default() + " " + page.html.clean_text().cloned().unwrap_or_default().as_str() @@ -121,7 +120,7 @@ impl Model { self.pipeline.predict(&text) } - pub fn predict(&self, page: &crate::webpage::Webpage) -> naive_bayes::Prediction
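Not part of the diff: the signature changes just above (RegionCount::open and load_dataset moving from a generic `P: AsRef<Path>` parameter to a plain `&Path`) trade a little call-site convenience for a single non-generic instantiation of the function body instead of one copy per caller type. A minimal standalone sketch of that pattern, with hypothetical function names:

use std::path::Path;

// Before: generic over the path type; the compiler emits one copy of the body
// for every distinct argument type used by callers.
fn open_generic<P: AsRef<Path>>(path: P) -> bool {
    path.as_ref().exists()
}

// After: a single concrete instantiation; callers add `&` at the call site,
// which is why the tests above now write `RegionCount::open(&...)`.
fn open_plain(path: &Path) -> bool {
    path.exists()
}

fn main() {
    let tmp = std::env::temp_dir().join("region_count.json");
    assert_eq!(open_generic(&tmp), open_plain(&tmp));
}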

Jane Doe Photo of Jane Doe - + Professor
@@ -665,10 +665,10 @@ mod tests { (425) 123-4567 jane-doe@xyz.edu - + Jane's home page: - + Graduate students: Alice Jones diff --git a/core/src/webpage/schema_org/mod.rs b/crates/webpage/src/schema_org/mod.rs similarity index 97% rename from core/src/webpage/schema_org/mod.rs rename to crates/webpage/src/schema_org/mod.rs index 036a4aa76..7745a2b50 100644 --- a/core/src/webpage/schema_org/mod.rs +++ b/crates/webpage/src/schema_org/mod.rs @@ -18,8 +18,8 @@ use std::collections::HashMap; use kuchiki::NodeRef; use serde::{Deserialize, Serialize}; +use tokenizer::FlattenedJson; -use crate::tokenizer::FlattenedJson; use crate::Result; mod json_ld; @@ -31,14 +31,14 @@ pub enum Property { Item(Item), } impl Property { - pub(crate) fn try_into_string(&self) -> Option { + pub fn try_into_string(&self) -> Option { match self { Property::String(s) => Some(s.clone()), Property::Item(_) => None, } } - pub(crate) fn try_into_item(&self) -> Option { + pub fn try_into_item(&self) -> Option { match self { Property::String(_) => None, Property::Item(it) => Some(it.clone()), @@ -229,7 +229,7 @@ pub(crate) fn flattened_json(schemas: Vec) -> Result { .into_iter() .map(|item| item.into_single_map()) .collect(); - FlattenedJson::new(&single_maps) + Ok(FlattenedJson::new(&single_maps)?) } #[cfg(test)] @@ -275,7 +275,7 @@ mod tests { #[test] fn stackoverflow_question() { - let html = include_str!("../../../testcases/schema_org/stackoverflow.html"); + let html = include_str!("../../../core/testcases/schema_org/stackoverflow.html"); let root = kuchiki::parse_html().one(html); let res = microdata::parse_schema(root); @@ -346,7 +346,7 @@ mod tests { #[test] fn stackoverflow_question_with_code() { - let html = include_str!("../../../testcases/schema_org/stackoverflow_with_code.html"); + let html = include_str!("../../../core/testcases/schema_org/stackoverflow_with_code.html"); let root = kuchiki::parse_html().one(html); let res = microdata::parse_schema(root); @@ -439,7 +439,7 @@ mod tests { #[test] fn recipe() { - let html = include_str!("../../../testcases/schema_org/recipe.html"); + let html = include_str!("../../../core/testcases/schema_org/recipe.html"); let root = kuchiki::parse_html().one(html); let res = microdata::parse_schema(root); diff --git a/core/src/webpage/url_ext.rs b/crates/webpage/src/url_ext.rs similarity index 96% rename from core/src/webpage/url_ext.rs rename to crates/webpage/src/url_ext.rs index f166b289c..0bd17af49 100644 --- a/core/src/webpage/url_ext.rs +++ b/crates/webpage/src/url_ext.rs @@ -17,12 +17,12 @@ use publicsuffix::Psl; static FULL_LIST: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { - include_str!("../../public_suffix_list.dat") + include_str!("../../core/public_suffix_list.dat") .parse() .expect("Failed to parse public suffix list") }); static ICANN_LIST: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { - include_str!("../../public_icann_suffix.dat") + include_str!("../../core/public_icann_suffix.dat") .parse() .expect("Failed to parse public icann suffix list") }); diff --git a/justfile b/justfile index 77b677850..7bbb12555 100644 --- a/justfile +++ b/justfile @@ -33,8 +33,8 @@ export STRACT_CARGO_ARGS := env_var_or_default("STRACT_CARGO_ARGS", "") ./scripts/export_fact_model @configure *ARGS: - just setup {{ARGS}} - just prepare_models + # just setup {{ARGS}} + # just prepare_models RUST_LOG="none,stract=info" just cargo run --release --all-features -- configure {{ARGS}} @setup_python_env: @@ -46,3 +46,29 @@ export STRACT_CARGO_ARGS := 
env_var_or_default("STRACT_CARGO_ARGS", "") @cargo *ARGS: LIBTORCH="{{justfile_directory()}}/libtorch" LD_LIBRARY_PATH="{{justfile_directory()}}/libtorch/lib" DYLD_LIBRARY_PATH="{{justfile_directory()}}/libtorch/lib" cargo {{ARGS}} + +@bench-compile A B: + #!/bin/bash + BRANCH=$(git rev-parse --abbrev-ref HEAD) + hyperfine --show-output -w 2 --export-markdown bench-compile-{{A}}-vs-{{B}}.md \ + -p "git switch {{A}} && touch core/src/lib.rs && sleep 1" -n "{{A}}" "cargo build" \ + -p "git switch {{B}} && touch crates/core/src/lib.rs && sleep 1" -n "{{B}}" "cargo build" + +@bench-compile-release A B: + #!/bin/bash + BRANCH=$(git rev-parse --abbrev-ref HEAD) + hyperfine --show-output -w 2 --export-markdown bench-compile-release-{{A}}-vs-{{B}}.md \ + -p "git switch {{A}} && touch core/src/lib.rs && sleep 1" -n "{{A}}" "cargo build --release" \ + -p "git switch {{B}} && touch crates/core/src/lib.rs && sleep 1" -n "{{B}}" "cargo build --release" + +@crateify path name: + cargo new crates/{{name}} --lib + mv {{path}} crates/{{name}}/src/lib.rs + echo {{name}} = { path = '"./crates/{{name}}"' } + echo "cargo add -p stract-core {{name}}" + +@librarify path: + cargo new lib/{{file_stem(path)}} --lib + mv {{path}} lib/{{file_stem(path)}}/src/lib.rs + echo {{file_stem(path)}} = { path = '"./lib/{{file_stem(path)}}"' } + echo "cargo add -p stract-core {{file_stem(path)}}" diff --git a/kuchiki/Cargo.toml b/lib/kuchiki/Cargo.toml similarity index 100% rename from kuchiki/Cargo.toml rename to lib/kuchiki/Cargo.toml diff --git a/kuchiki/LICENSE b/lib/kuchiki/LICENSE similarity index 100% rename from kuchiki/LICENSE rename to lib/kuchiki/LICENSE diff --git a/kuchiki/README.md b/lib/kuchiki/README.md similarity index 100% rename from kuchiki/README.md rename to lib/kuchiki/README.md diff --git a/kuchiki/docs/.nojekyll b/lib/kuchiki/docs/.nojekyll similarity index 100% rename from kuchiki/docs/.nojekyll rename to lib/kuchiki/docs/.nojekyll diff --git a/kuchiki/docs/404.html b/lib/kuchiki/docs/404.html similarity index 100% rename from kuchiki/docs/404.html rename to lib/kuchiki/docs/404.html diff --git a/kuchiki/docs/index.html b/lib/kuchiki/docs/index.html similarity index 100% rename from kuchiki/docs/index.html rename to lib/kuchiki/docs/index.html diff --git a/kuchiki/examples/find_matches.rs b/lib/kuchiki/examples/find_matches.rs similarity index 100% rename from kuchiki/examples/find_matches.rs rename to lib/kuchiki/examples/find_matches.rs diff --git a/kuchiki/examples/stack-overflow.rs b/lib/kuchiki/examples/stack-overflow.rs similarity index 100% rename from kuchiki/examples/stack-overflow.rs rename to lib/kuchiki/examples/stack-overflow.rs diff --git a/kuchiki/src/attributes.rs b/lib/kuchiki/src/attributes.rs similarity index 100% rename from kuchiki/src/attributes.rs rename to lib/kuchiki/src/attributes.rs diff --git a/kuchiki/src/cell_extras.rs b/lib/kuchiki/src/cell_extras.rs similarity index 100% rename from kuchiki/src/cell_extras.rs rename to lib/kuchiki/src/cell_extras.rs diff --git a/kuchiki/src/iter.rs b/lib/kuchiki/src/iter.rs similarity index 100% rename from kuchiki/src/iter.rs rename to lib/kuchiki/src/iter.rs diff --git a/kuchiki/src/lib.rs b/lib/kuchiki/src/lib.rs similarity index 100% rename from kuchiki/src/lib.rs rename to lib/kuchiki/src/lib.rs diff --git a/kuchiki/src/node_data_ref.rs b/lib/kuchiki/src/node_data_ref.rs similarity index 100% rename from kuchiki/src/node_data_ref.rs rename to lib/kuchiki/src/node_data_ref.rs diff --git a/kuchiki/src/parser.rs 
b/lib/kuchiki/src/parser.rs similarity index 100% rename from kuchiki/src/parser.rs rename to lib/kuchiki/src/parser.rs diff --git a/kuchiki/src/select.rs b/lib/kuchiki/src/select.rs similarity index 100% rename from kuchiki/src/select.rs rename to lib/kuchiki/src/select.rs diff --git a/kuchiki/src/serializer.rs b/lib/kuchiki/src/serializer.rs similarity index 100% rename from kuchiki/src/serializer.rs rename to lib/kuchiki/src/serializer.rs diff --git a/kuchiki/src/tests.rs b/lib/kuchiki/src/tests.rs similarity index 100% rename from kuchiki/src/tests.rs rename to lib/kuchiki/src/tests.rs diff --git a/kuchiki/src/tree.rs b/lib/kuchiki/src/tree.rs similarity index 100% rename from kuchiki/src/tree.rs rename to lib/kuchiki/src/tree.rs diff --git a/kuchiki/test_data/foo.html b/lib/kuchiki/test_data/foo.html similarity index 100% rename from kuchiki/test_data/foo.html rename to lib/kuchiki/test_data/foo.html diff --git a/lib/sonic/Cargo.toml b/lib/sonic/Cargo.toml new file mode 100644 index 000000000..521d9a243 --- /dev/null +++ b/lib/sonic/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "sonic" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow.workspace = true +async-trait.workspace = true +bincode.workspace = true +bytemuck.workspace = true +serde = { workspace = true, features = ["derive"] } +thiserror.workspace = true +tokio.workspace = true +tracing.workspace = true + +[dev-dependencies] +proptest.workspace = true +proptest-derive.workspace = true diff --git a/core/src/distributed/sonic/mod.rs b/lib/sonic/src/lib.rs similarity index 98% rename from core/src/distributed/sonic/mod.rs rename to lib/sonic/src/lib.rs index 1519e56d9..6d2b20cdd 100644 --- a/core/src/distributed/sonic/mod.rs +++ b/lib/sonic/src/lib.rs @@ -24,7 +24,7 @@ use tokio::{ net::{TcpListener, TcpStream, ToSocketAddrs}, }; -pub(crate) type Result = std::result::Result; +pub type Result = std::result::Result; #[derive(thiserror::Error, Debug)] pub enum Error { @@ -126,7 +126,7 @@ struct Header { } pub struct Server { - pub(super) listener: TcpListener, + listener: TcpListener, marker: PhantomData<(Req, Res)>, } diff --git a/core/src/distributed/sonic/service.rs b/lib/sonic/src/service.rs similarity index 91% rename from core/src/distributed/sonic/service.rs rename to lib/sonic/src/service.rs index 15fa1d4e2..03f4e5d3b 100644 --- a/core/src/distributed/sonic/service.rs +++ b/lib/sonic/src/service.rs @@ -137,8 +137,6 @@ macro_rules! sonic_service { use super::{$service, $($req),*}; - use $crate::distributed::sonic; - #[derive(Debug, Clone, ::serde::Deserialize)] pub enum Request { $($req($req),)* @@ -149,14 +147,14 @@ macro_rules! sonic_service { } #[derive(::serde::Serialize, ::serde::Deserialize)] pub enum Response { - $($req(<$req as sonic::service::Message<$service>>::Response),)* + $($req(<$req as $crate::service::Message<$service>>::Response),)* } $( - impl sonic::service::Wrapper<$service> for $req { + impl $crate::service::Wrapper<$service> for $req { fn wrap_request_ref(req: &Self) -> RequestRef { RequestRef::$req(req) } - fn unwrap_response(res: <$service as sonic::service::Service>::Response) -> Option { + fn unwrap_response(res: <$service as $crate::service::Service>::Response) -> Option { #[allow(irrefutable_let_patterns)] if let Response::$req(value) = res { Some(value) @@ -167,22 +165,22 @@ macro_rules! 
sonic_service { } )* #[async_trait::async_trait] - impl sonic::service::Service for $service { + impl $crate::service::Service for $service { type Request = Request; type RequestRef<'a> = RequestRef<'a>; type Response = Response; - async fn handle(req: Request, server: &Self) -> sonic::Result { + async fn handle(req: Request, server: &Self) -> $crate::Result { match req { $( - Request::$req(value) => Ok(Response::$req(sonic::service::Message::handle(value, server).await?)), + Request::$req(value) => Ok(Response::$req($crate::service::Message::handle(value, server).await?)), )* } } } impl $service { - pub async fn bind(self, addr: impl ::tokio::net::ToSocketAddrs) -> sonic::Result> { - sonic::service::Server::bind(self, addr).await + pub async fn bind(self, addr: impl ::tokio::net::ToSocketAddrs) -> $crate::Result<$crate::service::Server> { + $crate::service::Server::bind(self, addr).await } } } @@ -193,10 +191,9 @@ macro_rules! sonic_service { mod tests { use proptest::prelude::*; - use std::{marker::PhantomData, net::SocketAddr, sync::atomic::AtomicI32}; + use std::{future::Future, marker::PhantomData, net::SocketAddr, sync::atomic::AtomicI32}; use super::{Server, Service, Wrapper}; - use futures::Future; struct ConnectionBuilder { addr: SocketAddr, @@ -258,8 +255,6 @@ mod tests { use proptest_derive::Arbitrary; use serde::{Deserialize, Serialize}; - use crate::distributed::sonic; - use super::super::Message; pub struct CounterService { @@ -279,7 +274,7 @@ mod tests { impl Message for Change { type Response = i32; - async fn handle(self, server: &CounterService) -> sonic::Result { + async fn handle(self, server: &CounterService) -> crate::Result { let prev = server .counter .fetch_add(self.amount, std::sync::atomic::Ordering::SeqCst); @@ -291,7 +286,7 @@ mod tests { impl Message for Reset { type Response = (); - async fn handle(self, server: &CounterService) -> sonic::Result { + async fn handle(self, server: &CounterService) -> crate::Result { server.counter.store(0, std::sync::atomic::Ordering::SeqCst); Ok(()) } diff --git a/lib/tokenizer/Cargo.toml b/lib/tokenizer/Cargo.toml new file mode 100644 index 000000000..2526ab23a --- /dev/null +++ b/lib/tokenizer/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "tokenizer" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +itertools.workspace = true +logos.workspace = true +serde.workspace = true +serde_json.workspace = true +stdx.workspace = true +tantivy.workspace = true +url.workspace = true +whatlang.workspace = true diff --git a/core/src/tokenizer/add_space_last.rs b/lib/tokenizer/src/add_space_last.rs similarity index 100% rename from core/src/tokenizer/add_space_last.rs rename to lib/tokenizer/src/add_space_last.rs diff --git a/core/src/tokenizer/mod.rs b/lib/tokenizer/src/lib.rs similarity index 98% rename from core/src/tokenizer/mod.rs rename to lib/tokenizer/src/lib.rs index ad31a6fd3..07ca632b5 100644 --- a/core/src/tokenizer/mod.rs +++ b/lib/tokenizer/src/lib.rs @@ -23,8 +23,6 @@ use tantivy::tokenizer::{ use whatlang::Lang; -use crate::{ceil_char_boundary, floor_char_boundary}; - use self::{add_space_last::AddSpaceLast, split_preserve::StrSplitPreserve}; mod add_space_last; @@ -478,7 +476,7 @@ fn flatten(val: serde_json::Value) -> Vec { } impl FlattenedJson { - pub fn new(value: &T) -> crate::Result + pub fn new(value: &T) -> serde_json::Result where T: serde::Serialize, { @@ -569,9 +567,11 @@ impl<'a> 
tantivy::tokenizer::TokenStream for JsonFieldTokenStream<'a> { self.token.offset_from -= 1; self.token.offset_to += 1; - self.token.offset_from = floor_char_boundary(self.text, self.token.offset_from); + self.token.offset_from = + stdx::floor_char_boundary(self.text, self.token.offset_from); self.token.offset_to = - ceil_char_boundary(self.text, self.token.offset_to).min(self.text.len()); + stdx::ceil_char_boundary(self.text, self.token.offset_to) + .min(self.text.len()); } self.token diff --git a/core/src/tokenizer/split_preserve.rs b/lib/tokenizer/src/split_preserve.rs similarity index 100% rename from core/src/tokenizer/split_preserve.rs rename to lib/tokenizer/src/split_preserve.rs diff --git a/optics-lsp/Cargo.lock b/optics-lsp/Cargo.lock index be4d1259c..5d0986e67 100644 --- a/optics-lsp/Cargo.lock +++ b/optics-lsp/Cargo.lock @@ -246,15 +246,6 @@ dependencies = [ "either", ] -[[package]] -name = "itertools" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" -dependencies = [ - "either", -] - [[package]] name = "itoa" version = "1.0.6" @@ -281,7 +272,7 @@ dependencies = [ "diff", "ena", "is-terminal", - "itertools 0.10.5", + "itertools", "lalrpop-util", "petgraph", "pico-args", @@ -400,7 +391,6 @@ checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" name = "optics" version = "0.1.0" dependencies = [ - "itertools 0.11.0", "lalrpop", "lalrpop-util", "logos", diff --git a/optics-lsp/Cargo.toml b/optics-lsp/Cargo.toml index 780bbb2c7..805864ec6 100644 --- a/optics-lsp/Cargo.toml +++ b/optics-lsp/Cargo.toml @@ -15,6 +15,6 @@ serde = { version = "1.0.137", features = ["rc", "derive"] } serde-wasm-bindgen = "0.4.5" thiserror = "1.0.31" -optics = { path = "../optics" } +optics = { path = "../crates/optics" } [workspace] diff --git a/optics/build.rs b/optics/build.rs deleted file mode 100644 index ca5c2836d..000000000 --- a/optics/build.rs +++ /dev/null @@ -1,3 +0,0 @@ -fn main() { - lalrpop::process_root().unwrap(); -} diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml new file mode 100644 index 000000000..37622010e --- /dev/null +++ b/xtask/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "xtask" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow.workspace = true +heck = "0.4.1" +itertools.workspace = true +xshell = "0.2.5" diff --git a/xtask/src/main.rs b/xtask/src/main.rs new file mode 100644 index 000000000..f07612f32 --- /dev/null +++ b/xtask/src/main.rs @@ -0,0 +1,60 @@ +use std::{cmp::Ordering, collections::HashSet}; + +use anyhow::Result; +use heck::ToSnakeCase; +use itertools::Itertools; +use xshell::{cmd, Shell}; + +#[derive(Debug)] +struct DepTree { + name: String, + depth: u32, + deps: Vec, +} + +fn main() -> Result<()> { + let sh = Shell::new()?; + + let tree = cmd!(sh, "cargo tree --prefix depth").read()?; + + let lines = tree + .lines() + .filter_map(|l| { + let s = l.split_once("/stract/")?.0.split_once(' ')?.0; + Some((s[0..1].parse::().unwrap(), s[1..].to_string())) + }) + .collect_vec(); + + let mut path: Vec = Vec::new(); + + let mut nodes = HashSet::new(); + let mut edges = HashSet::new(); + + for (depth, name) in lines { + path.truncate(depth); + nodes.insert(name.clone()); + if let Some(parent) = path.last() { + edges.insert((parent.clone(), name.clone())); + } + path.push(name); + } + + println!("digraph G {{"); + for n in 
nodes { + if n == "xtask" { + continue; + } + println!(" {}[label={n:?}]", n.to_snake_case()); + } + for (a, b) in edges.iter().sorted() { + println!(" {} -> {}", a.to_snake_case(), b.to_snake_case()); + } + println!("}}"); + + // println!("flowchart TD"); + // for (a, b) in edges.iter().sorted() { + // println!(" {} --> {}", a.to_snake_case(), b.to_snake_case()); + // } + + Ok(()) +}
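The xtask binary above shells out to `cargo tree --prefix depth`, keeps only workspace members (lines whose path contains `/stract/`), and prints the crate dependency graph as a Graphviz digraph on stdout, skipping the xtask node itself. A usage sketch, assuming Graphviz is installed and with illustrative file names: `cargo run -p xtask > crates.dot && dot -Tsvg crates.dot -o crates.svg`. The commented-out lines at the bottom would emit the same edges as a Mermaid flowchart instead.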
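Also not part of the diff: several of the new crates (crates/webpage above being the clearest example) get their own thiserror-based Error enum and crate-level Result alias in place of the shared core error type and the anyhow conversions (the removed `.into()` calls and `anyhow::anyhow!` above), so variants can be returned directly. A minimal sketch of that shape, with illustrative variants only; it assumes a thiserror dependency, which the new crates already declare:

// Illustrative only; mirrors the shape of the Error/Result pair added to
// crates/webpage in this diff, not its exact set of variants.
#[derive(thiserror::Error, Debug)]
pub enum Error {
    #[error("Encountered an empty required field ({0})")]
    EmptyField(&'static str),

    #[error("IO error")]
    Io(#[from] std::io::Error),
}

pub type Result<T> = std::result::Result<T, Error>;

fn title(raw: &str) -> Result<&str> {
    let t = raw.trim();
    if t.is_empty() {
        // Variants are returned directly; no `.into()` to anyhow::Error needed.
        return Err(Error::EmptyField("title"));
    }
    Ok(t)
}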