Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions nostromodb/benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash
#
# Runs the benchmark queries against a database and condenses the measured
# "Real time" values into bracketed groups of three.
#
# Usage: benchmark.sh <DATA_DIR> <DATABASE_PATH> [RESULT_FILE]
#   DATA_DIR       forwarded unchanged to run_queries.sh
#   DATABASE_PATH  forwarded unchanged to run_queries.sh
#   RESULT_FILE    optional; when given, the summary is written to this file,
#                  otherwise it is printed to stdout
#
# Side effect: the combined stdout/stderr of run_queries.sh is saved to
# query_log.txt in the current directory (overwritten on every run).

# Check if the required arguments are provided
if [[ $# -lt 2 ]]; then
  echo "Usage: $0 <DATA_DIR> <DATABASE_PATH> [RESULT_FILE]" >&2
  exit 1
fi

# Arguments
DATA_DIR="$1"
DATABASE_PATH="$2"
RESULT_FILE="${3:-}"

# Print the database name
echo "Running queries on database: $DATABASE_PATH"

# Run queries and log the output. The pipeline's own status is that of tee,
# so the status of run_queries.sh is picked out of PIPESTATUS explicitly.
./run_queries.sh "$DATA_DIR" "$DATABASE_PATH" 2>&1 | tee query_log.txt
query_status=${PIPESTATUS[0]}
if [[ $query_status -ne 0 ]]; then
  # Best effort: still summarise whatever timings were logged, but make the
  # failure visible instead of silently producing a partial result.
  echo "Warning: run_queries.sh exited with status $query_status; timings may be incomplete." >&2
fi

# Extract every "Real time: <seconds> seconds" value from the log and emit
# them as "[t1,t2,t3]," - one line per three consecutive timings.
RESULT=$(
  grep -oP 'Real time: \d+\.\d+ seconds' query_log.txt \
    | sed -r -e 's/Real time: ([0-9]+\.[0-9]+) seconds/\1/' \
    | awk '{
        if (i % 3 == 0) { printf "[" }
        # Explicit format string: the value itself must never act as the format
        printf "%s", $1
        if (i % 3 != 2) { printf "," } else { print "]," }
        ++i
      }'
)

# Output the result
if [[ -n "$RESULT_FILE" ]]; then
  echo "$RESULT" > "$RESULT_FILE"
  echo "Result written to $RESULT_FILE"
else
  echo "$RESULT"
fi
14 changes: 14 additions & 0 deletions nostromodb/count.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
#
# Issues a "select count(*)" against one table through run_statements.sh.
#
# Usage: count.sh <data_path> <database_path> <table_name>
# The first two arguments are forwarded unchanged to run_statements.sh.

# All three positional arguments are mandatory
if (( $# < 3 )); then
  echo "Usage: $0 <data_path> <database_path> <table_name>"
  exit 1
fi

data_path=$1
database_path=$2
table_name=$3

# Build the statement once, then hand it over for execution
count_statement="select count(*) from ${table_name};"
./run_statements.sh "$data_path" "$database_path" "$count_statement"
21 changes: 21 additions & 0 deletions nostromodb/create_and_load.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
#
# Validates its arguments and then delegates the actual import to load_data.sh.
#
# Usage: create_and_load.sh <DB_PATH> <TABLE_NAME> <DATA_DIRECTORY> <NUM_FILES>

# All four positional arguments are mandatory
if (( $# < 4 )); then
  echo "Usage: $0 <DB_PATH> <TABLE_NAME> <DATA_DIRECTORY> <NUM_FILES>"
  exit 1
fi

db_path=$1
table_name=$2
data_directory=$3
num_files=$4

# The input directory has to exist before anything is loaded
if [[ ! -d "$data_directory" ]]; then
  echo "Error in create_and_load: Data directory '$data_directory' does not exist."
  exit 1
fi

# The file count must consist of decimal digits only
if [[ ! "$num_files" =~ ^[0-9]+$ ]]; then
  echo "Error in create_and_load: NUM_FILES must be a positive integer."
  exit 1
fi

# No explicit 'create table' is issued; the data is loaded straight away
echo "Load data"
./load_data.sh "$data_directory" "$db_path" "$table_name" "$num_files"
14 changes: 14 additions & 0 deletions nostromodb/drop_table.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
#
# Drops a database by recursively deleting the given path.
#
# Usage: drop_table.sh <database_path>

# Check that a non-empty path was provided; an empty string is never a
# legitimate target for a recursive delete.
if [[ $# -lt 1 || -z "$1" ]]; then
  echo "Usage: $0 <database_path>" >&2
  exit 1
fi

# Arguments
DATABASE_PATH="$1"

echo "Dropping database: $DATABASE_PATH"

# "--" ends option parsing so a path beginning with "-" is treated as a path
rm -rf -- "${DATABASE_PATH}"
5 changes: 5 additions & 0 deletions nostromodb/install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
#
# Installs Docker and downloads the NostromoDB container image.

# Install Docker as a snap package (needs root via sudo)
sudo snap install docker
# Refresh the apt package index.
# NOTE(review): nothing in this script installs a package through apt
# afterwards - confirm whether this step is still needed.
sudo apt-get update
# Download the latest NostromoDB image.
# NOTE(review): unlike the two commands above this runs without sudo -
# presumably the current user is allowed to talk to the Docker daemon; verify.
docker pull svilenmihaylov/nostromodb:latest
40 changes: 40 additions & 0 deletions nostromodb/load_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
#
# Imports up to <max_files> "*.json.gz" files from a directory into a table,
# one import statement per file, and finally flushes the table.
#
# Usage: load_data.sh <data_directory> <database_path> <table_name> <max_files>
#
# Each file is referenced in the import statement as '/data/<file name>'.
# NOTE(review): presumably run_statements.sh makes <data_directory> visible
# under /data - confirm against run_statements.sh.

# Check if the required arguments are provided
if [[ $# -lt 4 ]]; then
  echo "Usage: $0 <data_directory> <database_path> <table_name> <max_files>" >&2
  exit 1
fi

# Arguments
DATA_DIR="$1"
DB_PATH="$2"
TABLE_NAME="$3"
MAX_FILES="$4"

# Validate that MAX_FILES is a number
if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then
  echo "Error: <max_files> must be a positive integer." >&2
  exit 1
fi

# Force base 10 so that values with leading zeros ("010", "08") are not
# interpreted as octal by the arithmetic comparison below.
MAX_FILES=$((10#$MAX_FILES))

# Zero would otherwise still load one file, because the limit is only checked
# after a file has been imported.
if (( MAX_FILES == 0 )); then
  echo "Error: <max_files> must be a positive integer." >&2
  exit 1
fi

counter=0

# Without nullglob an unmatched pattern would be iterated as a literal string
shopt -s nullglob

# Loop through each .json.gz file in the directory. Pathname expansion yields
# the names already sorted and keeps names containing whitespace intact.
for file in "$DATA_DIR"/*.json.gz; do
  echo "Progress: $counter of $MAX_FILES files loaded"

  if [[ -f "$file" ]]; then
    # Strip the directory part; only the bare file name goes into the statement
    base_name=${file##*/}
    ./run_statements.sh "$DATA_DIR" "$DB_PATH" "import from '/data/$base_name' into $TABLE_NAME options {'has_top_level_array': false}"
    counter=$((counter + 1))
  fi

  # Stop processing if the max number of files is reached
  if (( counter >= MAX_FILES )); then
    echo "Copied maximum number of files: $MAX_FILES"
    break
  fi
done

# Flush the table once after all imports
./run_statements.sh "$DATA_DIR" "$DB_PATH" "pragma table_flush('$TABLE_NAME')"
77 changes: 77 additions & 0 deletions nostromodb/main.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/bin/bash
#
# Entry point of the benchmark: reads the optional arguments, verifies the
# data directory, asks for the dataset size when requested, and runs install.sh.

DEFAULT_DATA_DIRECTORY=~/data/bluesky
DEFAULT_CHOICE=ask

# Positional arguments (all optional):
#   $1  dataset-size choice (a menu number), or "ask" to be prompted
#   $2  directory holding the input data
#   $5  prefix for the output file names
# NOTE(review): $3 and $4 are not read anywhere in this script - confirm that
# taking the prefix from $5 is intentional.
CHOICE="${1:-$DEFAULT_CHOICE}"
DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}"
OUTPUT_PREFIX="${5:-_m6i.8xlarge}"

# Bail out early when the data directory is missing
[[ -d "$DATA_DIRECTORY" ]] || {
  echo "Error in main: Data directory '$DATA_DIRECTORY' does not exist."
  exit 1
}

# Interactive mode: show the menu and read the selection into CHOICE
if [[ "$CHOICE" == "ask" ]]; then
  printf '%s\n' \
    "Select the dataset size to benchmark:" \
    "1) 1m (default)" \
    "2) 10m" \
    "3) 100m" \
    "4) 1000m" \
    "5) all"
  read -p "Enter the number corresponding to your choice: " CHOICE
fi

./install.sh

#######################################
# Runs the full benchmark cycle for one dataset size: load the data, record
# data size, row count, query results and physical query plans, run the timed
# benchmark, and finally drop the database again.
# Globals:
#   DATA_DIRECTORY (read) - directory holding the input files
#   OUTPUT_PREFIX  (read) - prefix of the generated output files
# Arguments:
#   $1 - number of data files to load; also used in the output file names
#        "<OUTPUT_PREFIX>_bluesky_<size>m.<kind>"
# Outputs:
#   Forwards the output of the helper scripts to stdout and tees it into the
#   per-step output files in the current directory.
# Exits:
#   Terminates the script with status 1 when DATA_DIRECTORY holds fewer
#   loadable files than requested.
#######################################
benchmark() {
  local size=$1
  local file_count
  local DB_DIR

  # Count only what load_data.sh actually imports: "*.json.gz" files directly
  # inside DATA_DIRECTORY. Unrelated files or files in subdirectories must not
  # let this check pass.
  file_count=$(find "$DATA_DIRECTORY" -maxdepth 1 -type f -name '*.json.gz' | wc -l)
  # Normalise the number (some wc implementations pad it with blanks)
  file_count=$((file_count))
  if (( file_count < size )); then
    echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." >&2
    exit 1
  fi

  # Start from an empty database directory for this size
  DB_DIR=~/data/"nostromodb_jsonbench_${size}"
  rm -rf "${DB_DIR}"
  mkdir -p "${DB_DIR}"

  ./create_and_load.sh "$DB_DIR" bluesky "$DATA_DIRECTORY" "$size"
  ./total_size.sh "$DB_DIR" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size"
  ./count.sh "$DATA_DIRECTORY" "$DB_DIR" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count"
  ./query_results.sh "$DATA_DIRECTORY" "$DB_DIR" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results"
  ./physical_query_plans.sh "$DATA_DIRECTORY" "$DB_DIR" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.physical_query_plans"
  ./benchmark.sh "$DATA_DIRECTORY" "$DB_DIR" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime"
  ./drop_table.sh "$DB_DIR"
}

# Map the selected menu entry to the dataset size(s) to benchmark.
# Anything other than 2-5 falls through to the smallest dataset.
case "$CHOICE" in
  2) benchmark 10 ;;
  3) benchmark 100 ;;
  4) benchmark 1000 ;;
  5)
    # "all": run every size in ascending order
    for dataset_size in 1 10 100 1000; do
      benchmark "$dataset_size"
    done
    ;;
  *) benchmark 1 ;;
esac

./uninstall.sh
25 changes: 25 additions & 0 deletions nostromodb/physical_query_plans.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
#
# Prints the physical query plan of every query in queries.sql by running
# "EXPLAIN <query>" through run_statements.sh.
#
# Usage: physical_query_plans.sh <DATA_DIR> <DATABASE_PATH>
# Both arguments are forwarded unchanged to run_statements.sh.
# queries.sql is read from the current directory, one query per line.

# Check if the required arguments are provided. The message goes to stderr so
# it cannot end up in a result file when stdout is piped elsewhere.
if [[ $# -lt 2 ]]; then
  echo "Usage: $0 <DATA_DIR> <DATABASE_PATH>" >&2
  exit 1
fi

# Arguments
DATA_DIR="$1"
DATABASE_PATH="$2"

QUERY_NUM=1

# "|| [[ -n ... ]]" keeps the final query when the file has no trailing newline.
# NOTE(review): run_statements.sh inherits the loop's stdin (queries.sql), as
# it did before - if it ever reads stdin it would swallow the remaining queries.
while read -r query || [[ -n "$query" ]]; do
  # Blank lines are not queries; skip them so the numbering stays aligned
  if [[ -z "$query" ]]; then
    continue
  fi

  # Print the query number
  echo "------------------------------------------------------------------------------------------------------------------------"
  echo "Physical query plan for query Q$QUERY_NUM:"
  echo

  ./run_statements.sh "$DATA_DIR" "$DATABASE_PATH" "EXPLAIN $query"

  # Increment the query number
  QUERY_NUM=$((QUERY_NUM + 1))
done < queries.sql
5 changes: 5 additions & 0 deletions nostromodb/queries.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
select commit.collection as event, count(*) as count from bluesky group by event order by count desc, event asc;
select commit.collection as event, count(*) as count, count(distinct did) as users from bluesky where kind == 'commit' and commit.operation = 'create' group by event order by count desc;
select commit.collection as event, extract(hour from epoch_time_to_timestamp(time_us/1000000)) as hour_of_day, count(*) as count from bluesky where kind = 'commit' AND commit.operation = 'create' and commit.collection IN ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'] group by event, hour_of_day order by hour_of_day, event;
select did as user_id, min(epoch_time_to_timestamp(time_us/1000000)) as first_post_ts from bluesky where kind = 'commit' and commit.operation = 'create' and commit.collection = 'app.bsky.feed.post' group by user_id order by first_post_ts asc limit 3;
select did as user_id, 1000*extract(epoch from date_diff(coerce_to_str(max(epoch_time_to_timestamp(time_us/1000000))), coerce_to_str(min(epoch_time_to_timestamp(time_us/1000000))))) as activity_span from bluesky where kind = 'commit' and commit.operation = 'create' and commit.collection = 'app.bsky.feed.post' group by user_id order by activity_span desc limit 3;
64 changes: 64 additions & 0 deletions nostromodb/queries_formatted.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
------------------------------------------------------------------------------------------------------------------------
-- Q1 - Top event types
------------------------------------------------------------------------------------------------------------------------

-- Counts the rows per commit.collection value, most frequent first.
-- The secondary sort key (event asc) gives equal counts a fixed order and
-- keeps this formatted version in sync with the single-line Q1 in queries.sql.
select
commit.collection as event,
count(*) as count
from bluesky
group by event
order by count desc, event asc;

------------------------------------------------------------------------------------------------------------------------
-- Q2 - Top event types together with unique users per event type
------------------------------------------------------------------------------------------------------------------------

-- Restricted to rows with kind 'commit' and commit.operation 'create'.
-- Per commit.collection value: the row count and the number of distinct did
-- values, ordered by row count descending.
-- NOTE(review): this query compares kind with '==' while the other queries
-- use '=' - confirm both spellings are accepted equality operators.
select
commit.collection as event,
count(*) as count,
count(distinct did) as users
from bluesky
where kind == 'commit' and commit.operation = 'create'
group by event
order by count desc;

------------------------------------------------------------------------------------------------------------------------
-- Q3 - When do people use BlueSky
------------------------------------------------------------------------------------------------------------------------

-- Counts created posts, reposts and likes per collection and hour of day.
-- NOTE(review): time_us is divided by 1000000 before the timestamp
-- conversion - presumably microseconds to seconds; confirm.
select
commit.collection as event,
extract(hour from epoch_time_to_timestamp(time_us/1000000)) as hour_of_day,
count(*) as count
from bluesky
where kind = 'commit' AND commit.operation = 'create' and commit.collection IN ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like']
group by event, hour_of_day
order by hour_of_day, event;

------------------------------------------------------------------------------------------------------------------------
-- Q4 - top 3 post veterans
------------------------------------------------------------------------------------------------------------------------

-- For every did, the earliest timestamp among its created
-- 'app.bsky.feed.post' commits; returns the three earliest.
select
did as user_id,
min(epoch_time_to_timestamp(time_us/1000000)) as first_post_ts
from bluesky
where kind = 'commit' and commit.operation = 'create' and commit.collection = 'app.bsky.feed.post'
group by user_id
order by first_post_ts asc
limit 3;

------------------------------------------------------------------------------------------------------------------------
-- Q5 - top 3 users with longest activity
------------------------------------------------------------------------------------------------------------------------

-- For every did, the difference between its latest and earliest created
-- 'app.bsky.feed.post' commit; returns the three largest spans.
-- NOTE(review): 1000*extract(epoch ...) presumably expresses the span in
-- milliseconds - confirm the unit returned by extract(epoch from date_diff()).
select
did as user_id,
1000*extract(epoch from date_diff(
coerce_to_str(max(epoch_time_to_timestamp(time_us/1000000))),
coerce_to_str(min(epoch_time_to_timestamp(time_us/1000000))))) as activity_span
from bluesky
where kind = 'commit' and commit.operation = 'create' and commit.collection = 'app.bsky.feed.post'
group by user_id
order by activity_span desc
limit 3;
20 changes: 20 additions & 0 deletions nostromodb/query_log.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
Running query: select commit.collection as event, count(*) as count from bluesky group by event order by count desc, event asc;
Real time: 0.02692273 seconds
Real time: 0.01694564 seconds
Real time: 0.01638707 seconds
Running query: select commit.collection as event, count(*) as count, count(distinct did) as users from bluesky where kind == 'commit' and commit.operation = 'create' group by event order by count desc;
Real time: 0.09046148 seconds
Real time: 0.09066723 seconds
Real time: 0.06633239 seconds
Running query: select commit.collection as event, extract(hour from epoch_time_to_timestamp(time_us/1000000)) as hour_of_day, count(*) as count from bluesky where kind = 'commit' AND commit.operation = 'create' and commit.collection IN ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'] group by event, hour_of_day order by hour_of_day, event;
Real time: 0.07178712 seconds
Real time: 0.05345615 seconds
Real time: 0.07188973 seconds
Running query: select did as user_id, min(epoch_time_to_timestamp(time_us/1000000)) as first_post_ts from bluesky where kind = 'commit' and commit.operation = 'create' and commit.collection = 'app.bsky.feed.post' group by user_id order by first_post_ts asc limit 3;
Real time: 0.08329340 seconds
Real time: 0.05289203 seconds
Real time: 0.05629961 seconds
Running query: select did as user_id, 1000*extract(epoch from date_diff(coerce_to_str(max(epoch_time_to_timestamp(time_us/1000000))), coerce_to_str(min(epoch_time_to_timestamp(time_us/1000000))))) as activity_span from bluesky where kind = 'commit' and commit.operation = 'create' and commit.collection = 'app.bsky.feed.post' group by user_id order by activity_span desc limit 3;
Real time: 0.08881190 seconds
Real time: 0.06515721 seconds
Real time: 0.06154216 seconds
25 changes: 25 additions & 0 deletions nostromodb/query_results.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
#
# Prints the result of every query in queries.sql by running each one through
# run_statements.sh.
#
# Usage: query_results.sh <DATA_DIR> <DATABASE_PATH>
# Both arguments are forwarded unchanged to run_statements.sh.
# queries.sql is read from the current directory, one query per line.

# Check if the required arguments are provided. The message goes to stderr so
# it cannot end up in a result file when stdout is piped elsewhere.
if [[ $# -lt 2 ]]; then
  echo "Usage: $0 <DATA_DIR> <DATABASE_PATH>" >&2
  exit 1
fi

# Arguments
DATA_DIR="$1"
DATABASE_PATH="$2"

QUERY_NUM=1

# "|| [[ -n ... ]]" keeps the final query when the file has no trailing newline.
# NOTE(review): run_statements.sh inherits the loop's stdin (queries.sql), as
# it did before - if it ever reads stdin it would swallow the remaining queries.
while read -r query || [[ -n "$query" ]]; do
  # Blank lines are not queries; skip them so the numbering stays aligned
  if [[ -z "$query" ]]; then
    continue
  fi

  # Print the query
  echo "------------------------------------------------------------------------------------------------------------------------"
  echo "Result for query Q$QUERY_NUM:"
  echo

  ./run_statements.sh "$DATA_DIR" "$DATABASE_PATH" "$query"

  # Increment the query number
  QUERY_NUM=$((QUERY_NUM + 1))
done < queries.sql
7 changes: 7 additions & 0 deletions nostromodb/results/_m6i.8xlarge_bluesky_1m.count
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Opened database at '/db/catalog.json'
╭───────────────╮
│ count(*): int │
├───────────────┤
│ 1000000 │
╰───────────────╯
1 value(s) returned.
1 change: 1 addition & 0 deletions nostromodb/results/_m6i.8xlarge_bluesky_1m.data_size
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
209420288
Loading