diff --git a/apps/labrinth/.env.local b/apps/labrinth/.env.local
index a22b73fda5..8f6c29d907 100644
--- a/apps/labrinth/.env.local
+++ b/apps/labrinth/.env.local
@@ -16,7 +16,7 @@ DATABASE_URL=postgresql://labrinth:labrinth@localhost/labrinth
 DATABASE_MIN_CONNECTIONS=0
 DATABASE_MAX_CONNECTIONS=16
 
-SEARCH_BACKEND=meilisearch
+SEARCH_BACKEND=typesense
 
 # Meilisearch configuration
 MEILISEARCH_READ_ADDR=http://localhost:7700
diff --git a/apps/labrinth/src/search/backend/typesense/mod.rs b/apps/labrinth/src/search/backend/typesense/mod.rs
index dec7923e2c..e7f302b8c7 100644
--- a/apps/labrinth/src/search/backend/typesense/mod.rs
+++ b/apps/labrinth/src/search/backend/typesense/mod.rs
@@ -83,10 +83,16 @@ pub struct RequestConfig {
     pub prioritize_exact_match: bool,
     #[serde(default = "default_prioritize_num_matching_fields")]
     pub prioritize_num_matching_fields: bool,
+    #[serde(default = "default_prioritize_token_positions")]
+    pub prioritize_token_positions: bool,
+    #[serde(default = "default_drop_tokens_threshold")]
+    pub drop_tokens_threshold: usize,
     #[serde(default)]
     pub text_match_type: TextMatchType,
     #[serde(default)]
     pub bucketing: Bucketing,
+    #[serde(default = "default_max_candidates")]
+    pub max_candidates: usize,
 }
 
 impl Default for RequestConfig {
@@ -98,32 +104,33 @@ impl Default for RequestConfig {
             prioritize_exact_match: default_prioritize_exact_match(),
             prioritize_num_matching_fields:
                 default_prioritize_num_matching_fields(),
+            prioritize_token_positions: default_prioritize_token_positions(),
+            drop_tokens_threshold: default_drop_tokens_threshold(),
             text_match_type: TextMatchType::default(),
             bucketing: Bucketing::default(),
+            max_candidates: default_max_candidates(),
         }
     }
 }
 
+/// Default `query_by` fields. `summary` is not searched by default.
+/// `default_query_by_weights` and `default_prefix` are parallel lists and
+/// must keep exactly one entry per field named here.
 fn default_query_by() -> Vec {
-    [
-        "name",
-        "indexed_name",
-        "slug",
-        "author",
-        "indexed_author",
-        "summary",
-    ]
-    .into_iter()
-    .map(str::to_string)
-    .collect()
+    ["name", "indexed_name", "slug", "author", "indexed_author"]
+        .into_iter()
+        .map(str::to_string)
+        .collect()
 }
 
+/// One weight per `default_query_by` field, in the same order.
 fn default_query_by_weights() -> Vec {
-    vec![15, 15, 10, 3, 3, 1]
+    vec![15, 15, 10, 3, 3]
 }
 
+/// One prefix flag per `default_query_by` field, in the same order.
 fn default_prefix() -> Vec {
-    vec![true, true, true, true, true, true]
+    vec![true, true, true, true, true]
 }
 
 const fn default_prioritize_exact_match() -> bool {
@@ -134,6 +141,21 @@ const fn default_prioritize_num_matching_fields() -> bool {
     false
 }
 
+/// Serde default for `RequestConfig::prioritize_token_positions`.
+const fn default_prioritize_token_positions() -> bool {
+    false
+}
+
+/// Serde default for `RequestConfig::drop_tokens_threshold`.
+const fn default_drop_tokens_threshold() -> usize {
+    1
+}
+
+/// Serde default for `RequestConfig::max_candidates`.
+const fn default_max_candidates() -> usize {
+    8
+}
+
 impl TypesenseConfig {
     pub fn new(meta_namespace: Option) -> Self {
         Self {
@@ -696,6 +718,14 @@ impl SearchBackend for Typesense {
                     .prioritize_num_matching_fields
                     .to_string(),
             ),
+            (
+                "prioritize_token_positions",
+                info.typesense_config.prioritize_token_positions.to_string(),
+            ),
+            (
+                "drop_tokens_threshold",
+                info.typesense_config.drop_tokens_threshold.to_string(),
+            ),
             (
                 "text_match_type",
                 info.typesense_config.text_match_type.as_str().to_string(),
@@ -707,6 +737,10 @@ impl SearchBackend for Typesense {
             ("group_limit", "1".to_string()),
             ("facet_by", "project_id".to_string()),
             ("max_facet_values", "0".to_string()),
+            (
+                "max_candidates",
+                info.typesense_config.max_candidates.to_string(),
+            ),
         ];
         if let Some(query_by_weights) =
             Self::query_by_weights(&info.typesense_config)
diff --git a/scripts/import-projects.py b/scripts/import-projects.py
new file mode 100755
index 0000000000..124bef8a8e
--- /dev/null
+++ b/scripts/import-projects.py
@@ -0,0 +1,301 @@
+#!/usr/bin/env python3
+"""
+Search projects on api.modrinth.com and import results into the local database
+with correct author names.
+
+Modes:
+  search  - Import top N results for a text query
+  top     - Import the top N projects by total downloads (for building a
+            representative corpus that mirrors prod IDF distributions)
+
+Usage:
+  python3 scripts/import-projects.py search <query> [limit]
+  python3 scripts/import-projects.py top [count]
+
+Examples:
+  python3 scripts/import-projects.py search "sodium" 5
+  python3 scripts/import-projects.py top 1000
+"""
+
+import hashlib
+import json
+import subprocess
+import sys
+import time
+import urllib.parse
+import urllib.request
+
+ADMIN_USER_ID = 103587649610509
+DB_CONTAINER = "labrinth-postgres"
+DB_USER = "labrinth"
+DB_NAME = "labrinth"
+API_BASE = "https://api.modrinth.com/v2"
+HEADERS = {"User-Agent": "import-projects-script/1.0"}
+
+# Synthetic author ids fall in [AUTHOR_ID_BASE, AUTHOR_ID_BASE + AUTHOR_ID_SPAN)
+# and are derived from the author name, so they are stable across runs.
+AUTHOR_ID_BASE = 200_000_000_000_000
+AUTHOR_ID_SPAN = 100_000_000_000_000
+
+seen_slugs = set()
+author_user_ids = {}
+
+
+def api_get(url):
+    """Fetch `url` with the script's User-Agent and return the decoded JSON."""
+    req = urllib.request.Request(url, headers=HEADERS)
+    with urllib.request.urlopen(req) as resp:
+        return json.loads(resp.read().decode())
+
+
+def psql(sql):
+    """Run `sql` via psql inside the DB container; return True on success."""
+    result = subprocess.run(
+        [
+            "podman",
+            "exec",
+            DB_CONTAINER,
+            "psql",
+            "-U",
+            DB_USER,
+            "-d",
+            DB_NAME,
+            "-c",
+            sql,
+        ],
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode != 0:
+        print(f" DB error: {result.stderr.strip()}", file=sys.stderr)
+        return False
+    return True
+
+
+def sql_escape(s):
+    """Double single quotes so `s` can sit inside a '...' SQL literal."""
+    return s.replace("'", "''")
+
+
+def author_user_id(author_name):
+    """Return the deterministic synthetic user id for `author_name`."""
+    digest = hashlib.sha256(author_name.encode("utf-8")).digest()
+    return AUTHOR_ID_BASE + int.from_bytes(digest[:8], "big") % AUTHOR_ID_SPAN
+
+
+def get_or_create_author_user(author_name):
+    """Return the local user id for `author_name`, creating the user if needed.
+
+    Falls back to ADMIN_USER_ID when the insert fails.
+    """
+    if author_name in author_user_ids:
+        return author_user_ids[author_name]
+    # Derive the id from the name, not a per-run counter: a counter restarts on
+    # every run, and ON CONFLICT (id) DO NOTHING would then silently reuse a
+    # row that an earlier run created for a different author.
+    uid = author_user_id(author_name)
+    name_e = sql_escape(author_name)
+    sql = f"""
+    INSERT INTO users (id, username, email, created, role)
+    VALUES ({uid}, '{name_e}', '{name_e}@imported.local', NOW(), 'developer')
+    ON CONFLICT (id) DO NOTHING;
+    """
+    author_user_ids[author_name] = uid if psql(sql) else ADMIN_USER_ID
+    return author_user_ids[author_name]
+
+
+def import_project(hit, counter):
+    """Insert one search hit as a mod with its team, owner and a version row.
+
+    `counter` is folded into the generated ids (base + counter * 5 + k).
+    Returns False when the slug was already seen in this run or the DB
+    insert fails, True otherwise.
+    """
+    slug = hit.get("slug") or ""
+    if slug in seen_slugs:
+        return False
+    seen_slugs.add(slug)
+
+    # `or` also covers keys that are present but null in the API response.
+    title = hit.get("title") or ""
+    summary = (hit.get("description") or "")[:2048]
+    project_id_api = hit.get("project_id", "")
+    # int() keeps anything non-numeric out of the unquoted SQL positions.
+    downloads = int(hit.get("downloads") or 0)
+    follows = int(hit.get("follows") or 0)
+    icon_url = hit.get("icon_url") or None
+    author_name = hit.get("author") or "Unknown"
+
+    print(f" Fetching: {title}")
+    try:
+        project_data = api_get(f"{API_BASE}/project/{project_id_api}")
+        description = (project_data.get("body") or "")[:65536]
+        icon_url = project_data.get("icon_url") or icon_url
+    except Exception:
+        # Best effort: fall back to the short summary from the search hit.
+        description = summary
+
+    author_id = get_or_create_author_user(author_name)
+
+    base = int(time.time() * 1e9) % 900_000_000_000_000 + 100_000_000_000_000
+    mod_id = base + counter * 5
+    team_id = base + counter * 5 + 1
+    member_id = base + counter * 5 + 2
+    version_id = base + counter * 5 + 3
+
+    title_e = sql_escape(title)
+    summary_e = sql_escape(summary)
+    description_e = sql_escape(description)
+    slug_e = sql_escape(slug)
+    icon_col = f"'{sql_escape(icon_url)}'" if icon_url else "NULL"
+
+    print(
+        f" Importing: {title} (author={author_name}, downloads={downloads}, followers={follows})"
+    )
+
+    sql = f"""
+BEGIN;
+
+INSERT INTO teams (id) VALUES ({team_id});
+
+INSERT INTO mods (
+    id, team_id, name, summary, description,
+    published, downloads, follows,
+    status, license, side_types_migration_review_status,
+    components, monetization_status, slug,
+    icon_url, raw_icon_url
+) VALUES (
+    {mod_id},
+    {team_id},
+    '{title_e}',
+    '{summary_e}',
+    '{description_e}',
+    NOW(),
+    {downloads},
+    {follows},
+    'approved',
+    'LicenseRef-All-Rights-Reserved',
+    'reviewed',
+    '{{}}'::jsonb,
+    'monetized',
+    LOWER('{slug_e}'),
+    {icon_col},
+    {icon_col}
+);
+
+INSERT INTO team_members (
+    id, team_id, user_id, role, permissions,
+    accepted, payouts_split, ordering, is_owner
+) VALUES (
+    {member_id},
+    {team_id},
+    {author_id},
+    'Owner',
+    1275068466,
+    true,
+    1.00000000000000000000,
+    0,
+    true
+);
+
+INSERT INTO versions (
+    id, mod_id, name, version_number, version_type,
+    author_id, downloads, changelog, status, components
+) VALUES (
+    {version_id},
+    {mod_id},
+    '1.0.0',
+    '1.0.0',
+    'release',
+    {author_id},
+    {downloads},
+    '',
+    'listed',
+    '{{}}'::jsonb
+);
+
+INSERT INTO loaders_versions (loader_id, version_id) VALUES (2, {version_id});
+
+COMMIT;
+"""
+    return psql(sql)
+
+
+def mode_search(query, limit=5):
+    """Import the top `limit` Modrinth search results for `query`."""
+    encoded_query = urllib.parse.quote(query)
+    search_url = f"{API_BASE}/search?query={encoded_query}&limit={limit}&facets=[]"
+    print(f"Searching Modrinth for: {query} (limit: {limit})")
+
+    search_data = api_get(search_url)
+    hits = search_data.get("hits", [])
+
+    if not hits:
+        print("No results found.")
+        return
+
+    imported = 0
+    for i, hit in enumerate(hits):
+        if import_project(hit, i):
+            imported += 1
+
+    print(f"Done. Imported {imported} project(s).")
+
+
+def mode_top(count=1000):
+    """Import the `count` most-downloaded projects, in batches of up to 50."""
+    print(f"Fetching top {count} projects by downloads from Modrinth...")
+
+    imported = 0
+    batch_size = 50
+    counter = 0
+
+    for offset in range(0, count, batch_size):
+        limit = min(batch_size, count - offset)
+        url = (
+            f"{API_BASE}/search?limit={limit}&offset={offset}&index=downloads&facets=[]"
+        )
+        print(f"\n Batch offset={offset}, limit={limit}")
+
+        data = api_get(url)
+        hits = data.get("hits", [])
+
+        if not hits:
+            break
+
+        for hit in hits:
+            if import_project(hit, counter):
+                imported += 1
+            counter += 1
+
+        time.sleep(1)
+
+    print(f"\nDone. Imported {imported} project(s).")
+
+
+def main():
+    """Dispatch to `search` or `top` mode based on the command line."""
+    if len(sys.argv) < 2:
+        print(f"Usage: {sys.argv[0]} search <query> [limit]")
+        print(f" {sys.argv[0]} top [count]")
+        sys.exit(1)
+
+    mode = sys.argv[1]
+
+    if mode == "search":
+        if len(sys.argv) < 3:
+            print(f"Usage: {sys.argv[0]} search <query> [limit]")
+            sys.exit(1)
+        query = sys.argv[2]
+        limit = int(sys.argv[3]) if len(sys.argv) > 3 else 5
+        mode_search(query, limit)
+    elif mode == "top":
+        count = int(sys.argv[2]) if len(sys.argv) > 2 else 1000
+        mode_top(count)
+    else:
+        print(f"Unknown mode: {mode}. Use 'search' or 'top'.")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()