diff --git a/slothdb-parquet/README.md b/slothdb-parquet/README.md new file mode 100644 index 000000000..3d838f616 --- /dev/null +++ b/slothdb-parquet/README.md @@ -0,0 +1,14 @@ +# SlothDB (Parquet, single) + +Stateless variant of [`slothdb/`](../slothdb): the dataset is *not* ingested +into a SlothDB table. Instead, `create.sql` defines a `VIEW` over +`hits.parquet` via `SELECT * FROM 'hits.parquet'` and queries scan the parquet file +directly at query time. The view is persisted in `hits.slothdb` so each +`./query` invocation sees it without re-creating it. + +`./data-size` reports the on-disk size of `hits.parquet` (the only source +of truth in this configuration). + +The view does not convert `EventDate` or the `*Time` columns; they keep +their parquet integer types, so Q19/Q43 may fail with a type error. +`queries.sql` is the same file used by `slothdb/`. diff --git a/slothdb-parquet/benchmark.sh b/slothdb-parquet/benchmark.sh new file mode 100755 index 000000000..617422ddc --- /dev/null +++ b/slothdb-parquet/benchmark.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_DURABLE=yes +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/slothdb-parquet/check b/slothdb-parquet/check new file mode 100755 index 000000000..26b993503 --- /dev/null +++ b/slothdb-parquet/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +slothdb -c "SELECT 1" >/dev/null diff --git a/slothdb-parquet/create.sql b/slothdb-parquet/create.sql new file mode 100644 index 000000000..dde432576 --- /dev/null +++ b/slothdb-parquet/create.sql @@ -0,0 +1,8 @@ +-- slothdb's parser does not support DuckDB's `SELECT * REPLACE (...)` +-- syntax, so we cannot transparently rewrite EventTime/EventDate inside +-- the view definition. 
Leave the columns as their parquet types +-- (EventDate as INT32 days-since-epoch, *Time as INT32 unix seconds); +-- Q19/Q43 may fail with a type error on extract/DATE_TRUNC, matching +-- slothdb's own bench/run.py setup which queries `FROM 'hits.parquet'` +-- directly without type rewrites. +CREATE VIEW hits AS SELECT * FROM 'hits.parquet'; diff --git a/slothdb-parquet/data-size b/slothdb-parquet/data-size new file mode 100755 index 000000000..1aecba4a1 --- /dev/null +++ b/slothdb-parquet/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +du -bcs hits*.parquet | awk '/total$/ { print $1 }' diff --git a/slothdb-parquet/install b/slothdb-parquet/install new file mode 100755 index 000000000..40269c679 --- /dev/null +++ b/slothdb-parquet/install @@ -0,0 +1,25 @@ +#!/bin/bash +set -e + +# slothdb links against libgomp at runtime; Ubuntu 24.04 cloud images +# don't ship it by default and the binary then refuses to start with +# "error while loading shared libraries: libgomp.so.1". +# Refresh the package index first: fresh cloud images can have stale or +# empty apt lists, which makes the install fail to locate the package. +sudo apt-get update -y +sudo apt-get install -y libgomp1 + +if ! command -v slothdb >/dev/null 2>&1; then + # SlothDB only publishes a prebuilt linux-x64 binary as a GitHub release + # asset; there is no apt/dnf/brew package on Linux yet. Pin to a known + # release rather than `latest` so the benchmark stays reproducible. + VERSION="v0.2.6" + TMP_FILE=$(mktemp) + curl -fsSL \ + "https://github.com/SouravRoy-ETL/slothdb/releases/download/${VERSION}/slothdb-linux-x64" \ + -o "$TMP_FILE" + # install(1) copies the file as root with mode 0755 regardless of the + # caller's umask; `chmod +x` + `mv` left it owned by the invoking user. + sudo install -m 0755 "$TMP_FILE" /usr/local/bin/slothdb + rm -f "$TMP_FILE" +fi diff --git a/slothdb-parquet/load b/slothdb-parquet/load new file mode 100755 index 000000000..d3b5c1327 --- /dev/null +++ b/slothdb-parquet/load @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +# create.sql defines a VIEW over hits.parquet — no ingestion happens, the +# parquet file is read in place at query time. We persist the view in +# hits.slothdb so subsequent query invocations see it. 
+rm -f hits.slothdb +slothdb hits.slothdb -c "$(cat create.sql)" +sync diff --git a/slothdb-parquet/queries.sql b/slothdb-parquet/queries.sql new file mode 100644 index 000000000..b4115ee3a --- /dev/null +++ b/slothdb-parquet/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, 
SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), 
SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, 
SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT 
WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/slothdb-parquet/query b/slothdb-parquet/query new file mode 100755 index 000000000..e4a713f06 --- /dev/null +++ b/slothdb-parquet/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via slothdb against hits.slothdb +# (which contains a VIEW over hits.parquet). +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +# +# slothdb has no built-in `.timer` so we measure wall-clock around the +# subprocess. +set -e + +query=$(cat) + +err_file=$(mktemp) +trap 'rm -f "$err_file"' EXIT + +start_t=$(date +%s.%N) +out=$(slothdb hits.slothdb -c "$query" 2>"$err_file") && status=0 || status=$? +end_t=$(date +%s.%N) + +if [ "$status" -ne 0 ]; then + cat "$err_file" >&2 + exit "$status" +fi + +printf '%s\n' "$out" +awk -v s="$start_t" -v e="$end_t" 'BEGIN { printf "%.6f\n", e - s }' >&2 diff --git a/slothdb-parquet/start b/slothdb-parquet/start new file mode 100755 index 000000000..6b51ad42f --- /dev/null +++ b/slothdb-parquet/start @@ -0,0 +1,3 @@ +#!/bin/bash +# slothdb is embedded — no daemon to start. +exit 0 diff --git a/slothdb-parquet/stop b/slothdb-parquet/stop new file mode 100755 index 000000000..461b01a1b --- /dev/null +++ b/slothdb-parquet/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# slothdb is an embedded CLI tool — no daemon to stop. 
+exit 0 diff --git a/slothdb-parquet/template.json b/slothdb-parquet/template.json new file mode 100644 index 000000000..8166ac3f1 --- /dev/null +++ b/slothdb-parquet/template.json @@ -0,0 +1,12 @@ +{ + "system": "SlothDB (Parquet, single)", + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "column-oriented", + "embedded", + "stateless" + ] +} diff --git a/slothdb/README.md b/slothdb/README.md new file mode 100644 index 000000000..d90ae8379 --- /dev/null +++ b/slothdb/README.md @@ -0,0 +1,18 @@ +# SlothDB + +[SlothDB](https://github.com/SouravRoy-ETL/slothdb) is an embedded analytical +columnar SQL database written in C++20. It is invoked as a single-binary CLI +(`slothdb`) similar to `duckdb` and operates on a `.slothdb` database file. + +The install script downloads a pinned prebuilt Linux x86-64 binary from the +upstream GitHub release. Only an `x86_64` Linux binary is published, so the +benchmark currently runs on x86-64 ClickBench machines (c6a.\*, c7a.\*); on +ARM hosts (c8g.\*) the binary will not execute. + +The CLI has no built-in query timer, so `./query` wraps each invocation with +wall-clock measurement (`date +%s.%N`), the same approach used for systems +without internal timing (e.g. `presto`, `mongodb`). + +SlothDB still has a few SQL gaps as of v0.2.6 (per upstream's `bench/clickbench/README.md`), +so some queries are expected to fail and be recorded as `null` in the +result row. diff --git a/slothdb/benchmark.sh b/slothdb/benchmark.sh new file mode 100755 index 000000000..617422ddc --- /dev/null +++ b/slothdb/benchmark.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_DURABLE=yes +export BENCH_RESTARTABLE=no +exec ../lib/benchmark-common.sh diff --git a/slothdb/check b/slothdb/check new file mode 100755 index 000000000..26b993503 --- /dev/null +++ b/slothdb/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +slothdb -c "SELECT 1" >/dev/null diff --git a/slothdb/create.sql b/slothdb/create.sql new file mode 100644 index 000000000..c31434c1b --- /dev/null +++ b/slothdb/create.sql @@ -0,0 +1,9 @@ +-- Schema is inferred by CTAS in `./load`: +-- CREATE TABLE hits AS SELECT * FROM 'hits.parquet'; +-- +-- slothdb's parser doesn't accept DuckDB's `SELECT * REPLACE (...)` +-- rewrite, so the typed CREATE TABLE used by ClickBench in general +-- (BIGINT WatchID, TEXT URL, TIMESTAMP EventTime, ...) is not loaded +-- explicitly; the column types come from the parquet file's own +-- logical types. EventDate / *Time stay as integers rather than +-- becoming DATE / TIMESTAMP. diff --git a/slothdb/data-size b/slothdb/data-size new file mode 100755 index 000000000..aee106562 --- /dev/null +++ b/slothdb/data-size @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +wc -c < hits.slothdb diff --git a/slothdb/install b/slothdb/install new file mode 100755 index 000000000..40269c679 --- /dev/null +++ b/slothdb/install @@ -0,0 +1,25 @@ +#!/bin/bash +set -e + +# slothdb links against libgomp at runtime; Ubuntu 24.04 cloud images +# don't ship it by default and the binary then refuses to start with +# "error while loading shared libraries: libgomp.so.1". +# Refresh the package index first: fresh cloud images can have stale or +# empty apt lists, which makes the install fail to locate the package. +sudo apt-get update -y +sudo apt-get install -y libgomp1 + +if ! command -v slothdb >/dev/null 2>&1; then + # SlothDB only publishes a prebuilt linux-x64 binary as a GitHub release + # asset; there is no apt/dnf/brew package on Linux yet. Pin to a known + # release rather than `latest` so the benchmark stays reproducible. 
+ VERSION="v0.2.6" + TMP_FILE=$(mktemp) + curl -fsSL \ + "https://github.com/SouravRoy-ETL/slothdb/releases/download/${VERSION}/slothdb-linux-x64" \ + -o "$TMP_FILE" + # install(1) copies the file as root with mode 0755 regardless of the + # caller's umask; `chmod +x` + `mv` left it owned by the invoking user. + sudo install -m 0755 "$TMP_FILE" /usr/local/bin/slothdb + rm -f "$TMP_FILE" +fi diff --git a/slothdb/load b/slothdb/load new file mode 100755 index 000000000..184ead2ce --- /dev/null +++ b/slothdb/load @@ -0,0 +1,18 @@ +#!/bin/bash +set -e + +# Idempotent: blow away any prior partial DB. +rm -f hits.slothdb + +# Use CTAS — slothdb's parser doesn't support DuckDB's `SELECT * REPLACE (...)` +# (its REPLACE keyword is reserved for CREATE OR REPLACE), and a full +# column-typed CREATE TABLE has its own gaps. CTAS infers types directly +# from the parquet. The *Time columns stay as integers (unix seconds) +# rather than TIMESTAMPs; Q19/Q43 (extract / DATE_TRUNC on EventTime) +# may therefore fail with a type error, matching slothdb's own bench +# setup which runs against `FROM 'hits.parquet'` directly without +# converting time columns. 
+slothdb hits.slothdb -c "CREATE TABLE hits AS SELECT * FROM 'hits.parquet';" + +rm -f hits.parquet +sync diff --git a/slothdb/queries.sql b/slothdb/queries.sql new file mode 100644 index 000000000..b4115ee3a --- /dev/null +++ b/slothdb/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, 
SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), 
SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, 
SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT 
WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/slothdb/query b/slothdb/query new file mode 100755 index 000000000..6b403d4bc --- /dev/null +++ b/slothdb/query @@ -0,0 +1,27 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via slothdb against hits.slothdb. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line. +# Exit non-zero on error. +# +# slothdb has no built-in `.timer` so we measure wall-clock around the +# subprocess. The reported time therefore includes CLI process startup +# as well as query execution. +set -e + +query=$(cat) + +err_file=$(mktemp) +trap 'rm -f "$err_file"' EXIT + +start_t=$(date +%s.%N) +out=$(slothdb hits.slothdb -c "$query" 2>"$err_file") && status=0 || status=$? +end_t=$(date +%s.%N) + +if [ "$status" -ne 0 ]; then + cat "$err_file" >&2 + exit "$status" +fi + +printf '%s\n' "$out" +awk -v s="$start_t" -v e="$end_t" 'BEGIN { printf "%.6f\n", e - s }' >&2 diff --git a/slothdb/start b/slothdb/start new file mode 100755 index 000000000..6b51ad42f --- /dev/null +++ b/slothdb/start @@ -0,0 +1,3 @@ +#!/bin/bash +# slothdb is embedded — no daemon to start. +exit 0 diff --git a/slothdb/stop b/slothdb/stop new file mode 100755 index 000000000..461b01a1b --- /dev/null +++ b/slothdb/stop @@ -0,0 +1,3 @@ +#!/bin/bash +# slothdb is an embedded CLI tool — no daemon to stop. 
+exit 0 diff --git a/slothdb/template.json b/slothdb/template.json new file mode 100644 index 000000000..af9da6d15 --- /dev/null +++ b/slothdb/template.json @@ -0,0 +1,11 @@ +{ + "system": "SlothDB", + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "column-oriented", + "embedded" + ] +}