Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/labrinth/.env.local
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ DATABASE_URL=postgresql://labrinth:labrinth@localhost/labrinth
DATABASE_MIN_CONNECTIONS=0
DATABASE_MAX_CONNECTIONS=16

SEARCH_BACKEND=meilisearch
SEARCH_BACKEND=typesense

# Meilisearch configuration
MEILISEARCH_READ_ADDR=http://localhost:7700
Expand Down
51 changes: 38 additions & 13 deletions apps/labrinth/src/search/backend/typesense/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ pub struct RequestConfig {
pub prioritize_exact_match: bool,
#[serde(default = "default_prioritize_num_matching_fields")]
pub prioritize_num_matching_fields: bool,
#[serde(default = "default_prioritize_token_positions")]
pub prioritize_token_positions: bool,
#[serde(default = "default_drop_tokens_threshold")]
pub drop_tokens_threshold: usize,
#[serde(default)]
pub text_match_type: TextMatchType,
#[serde(default)]
Expand All @@ -98,32 +102,37 @@ impl Default for RequestConfig {
prioritize_exact_match: default_prioritize_exact_match(),
prioritize_num_matching_fields:
default_prioritize_num_matching_fields(),
prioritize_token_positions: default_prioritize_token_positions(),
drop_tokens_threshold: default_drop_tokens_threshold(),
text_match_type: TextMatchType::default(),
bucketing: Bucketing::default(),
}
}
}

fn default_query_by() -> Vec<String> {
[
"name",
"indexed_name",
"slug",
"author",
"indexed_author",
"summary",
]
.into_iter()
.map(str::to_string)
.collect()
// [
// "name",
// "indexed_name",
// "slug",
// "author",
// "indexed_author",
// "summary",
// ]
["name", "indexed_name", "slug", "author", "indexed_author"]
.into_iter()
.map(str::to_string)
.collect()
}

fn default_query_by_weights() -> Vec<u8> {
vec![15, 15, 10, 3, 3, 1]
// vec![15, 15, 10, 3, 3, 1]
vec![15, 15, 10, 3, 3]
}

fn default_prefix() -> Vec<bool> {
vec![true, true, true, true, true, true]
// vec![true, true, true, true, true, true]
vec![true, true, true, true, true]
}

const fn default_prioritize_exact_match() -> bool {
Expand All @@ -134,6 +143,14 @@ const fn default_prioritize_num_matching_fields() -> bool {
false
}

const fn default_prioritize_token_positions() -> bool {
true
}

const fn default_drop_tokens_threshold() -> usize {
0
}

impl TypesenseConfig {
pub fn new(meta_namespace: Option<String>) -> Self {
Self {
Expand Down Expand Up @@ -696,6 +713,14 @@ impl SearchBackend for Typesense {
.prioritize_num_matching_fields
.to_string(),
),
(
"prioritize_token_positions",
info.typesense_config.prioritize_token_positions.to_string(),
),
(
"drop_tokens_threshold",
info.typesense_config.drop_tokens_threshold.to_string(),
),
(
"text_match_type",
info.typesense_config.text_match_type.as_str().to_string(),
Expand Down
273 changes: 273 additions & 0 deletions scripts/import-projects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,273 @@
#!/usr/bin/env python3
"""
Search projects on api.modrinth.com and import results into the local database
with correct author names.

Modes:
search - Import top N results for a text query
top - Import the top N projects by total downloads (for building a
representative corpus that mirrors prod IDF distributions)

Usage:
python3 scripts/import-projects.py search <query> [limit]
python3 scripts/import-projects.py top [count]

Examples:
python3 scripts/import-projects.py search "sodium" 5
python3 scripts/import-projects.py top 1000
"""

import json
import subprocess
import sys
import time
import urllib.parse
import urllib.request

ADMIN_USER_ID = 103587649610509
DB_CONTAINER = "labrinth-postgres"
DB_USER = "labrinth"
DB_NAME = "labrinth"
API_BASE = "https://api.modrinth.com/v2"
HEADERS = {"User-Agent": "import-projects-script/1.0"}

seen_slugs = set()
author_user_ids = {}
next_user_id = 200_000_000_000_000


def api_get(url):
req = urllib.request.Request(url, headers=HEADERS)
with urllib.request.urlopen(req) as resp:
return json.loads(resp.read().decode())


def psql(sql):
result = subprocess.run(
[
"podman",
"exec",
DB_CONTAINER,
"psql",
"-U",
DB_USER,
"-d",
DB_NAME,
"-c",
sql,
],
capture_output=True,
text=True,
)
if result.returncode != 0:
print(f" DB error: {result.stderr.strip()}", file=sys.stderr)
return False
return True


def sql_escape(s):
return s.replace("'", "''")


def get_or_create_author_user(author_name):
global next_user_id
if author_name in author_user_ids:
return author_user_ids[author_name]
uid = next_user_id
next_user_id += 1
name_e = sql_escape(author_name)
sql = f"""
INSERT INTO users (id, username, email, created, role)
VALUES ({uid}, '{name_e}', '{name_e}@imported.local', NOW(), 'developer')
ON CONFLICT (id) DO NOTHING;
"""
if psql(sql):
author_user_ids[author_name] = uid
else:
author_user_ids[author_name] = ADMIN_USER_ID
return author_user_ids[author_name]


def import_project(hit, counter):
slug = hit.get("slug", "")
if slug in seen_slugs:
return False
seen_slugs.add(slug)

title = hit.get("title", "")
summary = hit.get("description", "")[:2048]
project_id_api = hit.get("project_id", "")
downloads = hit.get("downloads", 0)
follows = hit.get("follows", 0)
icon_url = hit.get("icon_url") or None
author_name = hit.get("author", "Unknown")

print(f" Fetching: {title}")
try:
project_data = api_get(f"{API_BASE}/project/{project_id_api}")
description = (project_data.get("body") or "")[:65536]
icon_url = project_data.get("icon_url") or icon_url
except Exception:
description = summary

author_id = get_or_create_author_user(author_name)

base = int(time.time() * 1e9) % 900_000_000_000_000 + 100_000_000_000_000
mod_id = base + counter * 5
team_id = base + counter * 5 + 1
member_id = base + counter * 5 + 2
version_id = base + counter * 5 + 3

title_e = sql_escape(title)
summary_e = sql_escape(summary)
description_e = sql_escape(description)
slug_e = sql_escape(slug)
icon_col = f"'{sql_escape(icon_url)}'" if icon_url else "NULL"

print(
f" Importing: {title} (author={author_name}, downloads={downloads}, followers={follows})"
)

sql = f"""
BEGIN;

INSERT INTO teams (id) VALUES ({team_id});

INSERT INTO mods (
id, team_id, name, summary, description,
published, downloads, follows,
status, license, side_types_migration_review_status,
components, monetization_status, slug,
icon_url, raw_icon_url
) VALUES (
{mod_id},
{team_id},
'{title_e}',
'{summary_e}',
'{description_e}',
NOW(),
{downloads},
{follows},
'approved',
'LicenseRef-All-Rights-Reserved',
'reviewed',
'{{}}'::jsonb,
'monetized',
LOWER('{slug_e}'),
{icon_col},
{icon_col}
);

INSERT INTO team_members (
id, team_id, user_id, role, permissions,
accepted, payouts_split, ordering, is_owner
) VALUES (
{member_id},
{team_id},
{author_id},
'Owner',
1275068466,
true,
1.00000000000000000000,
0,
true
);

INSERT INTO versions (
id, mod_id, name, version_number, version_type,
author_id, downloads, changelog, status, components
) VALUES (
{version_id},
{mod_id},
'1.0.0',
'1.0.0',
'release',
{author_id},
{downloads},
'',
'listed',
'{{}}'::jsonb
);

INSERT INTO loaders_versions (loader_id, version_id) VALUES (2, {version_id});

COMMIT;
"""
return psql(sql)


def mode_search(query, limit=5):
encoded_query = urllib.parse.quote(query)
search_url = f"{API_BASE}/search?query={encoded_query}&limit={limit}&facets=[]"
print(f"Searching Modrinth for: {query} (limit: {limit})")

search_data = api_get(search_url)
hits = search_data.get("hits", [])

if not hits:
print("No results found.")
return

imported = 0
for i, hit in enumerate(hits):
if import_project(hit, i):
imported += 1

print(f"Done. Imported {imported} project(s).")


def mode_top(count=1000):
print(f"Fetching top {count} projects by downloads from Modrinth...")

imported = 0
batch_size = 50
counter = 0

for offset in range(0, count, batch_size):
limit = min(batch_size, count - offset)
url = (
f"{API_BASE}/search?limit={limit}&offset={offset}&index=downloads&facets=[]"
)
print(f"\n Batch offset={offset}, limit={limit}")

data = api_get(url)
hits = data.get("hits", [])

if not hits:
break

for hit in hits:
if import_project(hit, counter):
imported += 1
counter += 1

time.sleep(1)

print(f"\nDone. Imported {imported} project(s).")


def main():
if len(sys.argv) < 2:
print(f"Usage: {sys.argv[0]} search <query> [limit]")
print(f" {sys.argv[0]} top [count]")
sys.exit(1)

mode = sys.argv[1]

if mode == "search":
if len(sys.argv) < 3:
print("Usage: {sys.argv[0]} search <query> [limit]")
sys.exit(1)
query = sys.argv[2]
limit = int(sys.argv[3]) if len(sys.argv) > 3 else 5
mode_search(query, limit)
elif mode == "top":
count = int(sys.argv[2]) if len(sys.argv) > 2 else 1000
mode_top(count)
else:
print(f"Unknown mode: {mode}. Use 'search' or 'top'.")
sys.exit(1)


if __name__ == "__main__":
main()
Loading