On GitHub: jasonaowen/pdxpug-incremental-schema
by @jasonaowen
https://jasonaowen.github.io/pdxpug-incremental-schema
or
https://github.com/jasonaowen/pdxpug-incremental-schema
https://www.githubarchive.org/
wget + cron
Gzipped, Line-Delimited JSON
February 18th, 5pm Pacific time: http://data.githubarchive.org/2016-02-19-01.json.gz
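A sketch of that fetch job as a crontab entry. This is hypothetical, not from the talk: it assumes GNU date, a /data/gharchive directory, the zero-padded hour naming shown above, and that % is escaped as \% per crontab rules.
# Ten past each hour, fetch the archive for the hour that just ended.
10 * * * * wget -q -P /data/gharchive "http://data.githubarchive.org/$(date -u -d '-1 hour' '+\%Y-\%m-\%d-\%H').json.gz"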
Python is pretty cool
CREATE TABLE raw_events (
  id SERIAL PRIMARY KEY,
  event JSONB NOT NULL
);
JSON allows escaped NUL bytes (\u0000), but if you try to look at a JSON object with one:
unsupported Unicode escape sequence
DETAIL: \u0000 cannot be converted to text.
CONTEXT: JSON data, line 1: ...yload": {"action": "created", "comment": {"body":...
The jsonb type also rejects \u0000 (because that cannot be represented in PostgreSQL's text type)
http://www.postgresql.org/docs/9.5/static/datatype-json.html
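A minimal reproduction, assuming standard_conforming_strings is on (the default), so the backslash reaches the JSON parser literally:
SELECT '{"body": "\u0000"}'::jsonb;
-- ERROR: unsupported Unicode escape sequence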
zcat 2016-02-19-01.json.gz | grep '\\u0000' | wc -l
The double backslash matters: we want grep to match the literal six characters \u0000 in the JSON source, not interpret the escape itself.
Other approaches are valid here; you could halt the import, or substitute a placeholder using sed.
zcat 2016-02-19-01.json.gz | grep -v '\\u0000' | python json-to-postgres.py owenja pdxpug raw_events event
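The real json-to-postgres.py lives in the repo linked above; here is a minimal sketch of the idea, assuming psycopg2 and the argument order used in that invocation (user, database, table, column):
#!/usr/bin/env python
# Sketch of json-to-postgres.py: read line-delimited JSON on stdin and
# insert each line into a jsonb column. Table and column names come from
# trusted command-line arguments, so plain interpolation is OK here.
import sys
import psycopg2

user, database, table, column = sys.argv[1:5]
conn = psycopg2.connect(user=user, dbname=database)
with conn, conn.cursor() as cur:
    for line in sys.stdin:
        # One JSON object per line; PostgreSQL parses it on insert.
        cur.execute(
            'INSERT INTO {} ({}) VALUES (%s)'.format(table, column),
            (line,))
conn.close()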
SELECT nspname || '.' || relname AS "relation",
pg_size_pretty(pg_relation_size(C.oid)) AS "size"
FROM pg_class C
LEFT JOIN pg_namespace N
ON (N.oid = C.relnamespace)
WHERE nspname NOT IN ('pg_catalog', 'information_schema')
ORDER BY pg_relation_size(C.oid) DESC;
will someone write this down?
CREATE TABLE events (
id INTEGER PRIMARY KEY
REFERENCES raw_events(id),
event JSONB NOT NULL
);
INSERT INTO events (id, event)
SELECT id, event
FROM raw_events;
Scalars: strings, numbers, booleans, and null.
Containers: objects and arrays.
{
"created_at": "2011-09-06T17:26:27Z",
"id": 12345
}
Objects are collections of keys and values. Keys are always strings; values can be any JSON type. Keys may not be unique!
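For example, jsonb quietly keeps only the last value when a key repeats:
SELECT '{"a": 1, "a": 2}'::jsonb;
-- {"a": 2}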
SELECT jsonb_typeof(event),
COUNT(*)
FROM events
GROUP BY jsonb_typeof(event);
CREATE FUNCTION type_count(in table_name text, in column_name text)
RETURNS TABLE (json_type text, count bigint) AS
$func$
BEGIN
  RETURN QUERY EXECUTE format(
    'SELECT jsonb_typeof(%I),
            COUNT(*)
       FROM %I
      GROUP BY jsonb_typeof(%I)',
    column_name, table_name, column_name);
END
$func$ LANGUAGE plpgsql STABLE;
SELECT * FROM type_count('events', 'event');
SELECT jsonb_object_keys(event),
COUNT(*)
FROM events
GROUP BY jsonb_object_keys(event);
CREATE FUNCTION key_count(in table_name text, in column_name text)
RETURNS TABLE (key text, count bigint) AS
$func$
BEGIN
  RETURN QUERY EXECUTE format(
    'SELECT jsonb_object_keys(%I),
            COUNT(*)
       FROM %I
      GROUP BY jsonb_object_keys(%I)',
    column_name, table_name, column_name);
END
$func$ LANGUAGE plpgsql STABLE;
SELECT * FROM key_count('events', 'event');
Now that we know what's in those objects, let's get them into columns.
ALTER TABLE events
  ADD COLUMN actor JSONB,
  ADD COLUMN created_at JSONB,
  ADD COLUMN github_id JSONB,
  ADD COLUMN org JSONB,
  ADD COLUMN payload JSONB,
  ADD COLUMN public JSONB,
  ADD COLUMN repo JSONB,
  ADD COLUMN type JSONB;
UPDATE events
SET actor = event->'actor',
created_at = event->'created_at',
github_id = event->'id', -- the JSON key is 'id'; the column is renamed to avoid clashing with events.id
org = event->'org',
payload = event->'payload',
public = event->'public',
repo = event->'repo',
type = event->'type';
CREATE FUNCTION explode_json_column(
  in table_name text,
  in column_name text
) RETURNS void AS
$func$
DECLARE
  record RECORD;
BEGIN
  FOR record IN SELECT key, count
                  FROM key_count(table_name, column_name) LOOP
    EXECUTE format('ALTER TABLE %I ADD COLUMN %I JSONB',
                   table_name, record.key);
    EXECUTE format('UPDATE %I SET %I = %I->%L',
                   table_name, record.key, column_name, record.key);
  END LOOP;
END
$func$ LANGUAGE plpgsql;
PostgreSQL has no implicit cast between a JSON[B] value and other scalar types.
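For instance, a direct cast fails (a hypothetical attempt, not in the original deck):
SELECT created_at::TIMESTAMP FROM events;
-- ERROR: cannot cast type jsonb to timestamp without time zone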
SELECT * FROM type_count('events', 'created_at');
SELECT created_at FROM events LIMIT 5;
ALTER TABLE events
ALTER COLUMN created_at
TYPE TIMESTAMP
USING created_at::TEXT::TIMESTAMP;
SELECT * FROM type_count('events', 'public');
SELECT public FROM events LIMIT 5;
ALTER TABLE events
ALTER COLUMN public
TYPE BOOLEAN
USING public::TEXT::BOOLEAN;
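A quick sanity check of what that column now holds (not in the original deck):
SELECT public, COUNT(*) FROM events GROUP BY public;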
The public field in this dataset is always true; you could probably drop it.
SELECT * FROM type_count('events', 'type');
SELECT type FROM events LIMIT 5;
ALTER TABLE events
ALTER COLUMN type
TYPE TEXT
USING type::TEXT::TEXT;
SELECT type FROM events LIMIT 5;
The JSON quotes survive the cast to TEXT (the timestamp parser happened to ignore them earlier), so trim them off:
UPDATE events SET type = TRIM(BOTH '"' FROM type);
CREATE FUNCTION set_concrete_type(
  in table_name text,
  in column_name text,
  in type_name text
) RETURNS void AS
$func$
BEGIN
  EXECUTE format(
    'ALTER TABLE %I ALTER COLUMN %I TYPE %s
      USING TRIM(BOTH ''"'' FROM %I::TEXT)::%s',
    table_name, column_name, type_name, column_name, type_name);
END
$func$ LANGUAGE plpgsql;
SELECT * FROM type_count('events', 'github_id');
SELECT github_id FROM events LIMIT 5;
SELECT set_concrete_type('events', 'github_id', 'BIGINT');
Let's normalize a little.
SELECT * FROM type_count('events', 'actor');
SELECT COUNT(DISTINCT actor) FROM events;
SELECT * FROM key_count('events', 'actor');
CREATE TABLE actors (
  actor_id SERIAL PRIMARY KEY,
  payload JSONB UNIQUE NOT NULL
);
INSERT INTO actors (payload)
SELECT DISTINCT actor FROM events;
ALTER TABLE events
ADD COLUMN actor_id INTEGER
REFERENCES actors (actor_id);
UPDATE events
SET actor_id =
(SELECT actor_id FROM actors WHERE actor = payload);
Without that UNIQUE constraint, this update would have taken far longer.
ALTER TABLE events
  ALTER COLUMN actor_id SET NOT NULL,
  DROP COLUMN actor;
SELECT explode_json_column('actors', 'payload');
SELECT * FROM type_count('actors', 'avatar_url');
SELECT set_concrete_type('actors', 'avatar_url', 'TEXT');
SELECT * FROM type_count('actors', 'id');
SELECT set_concrete_type('actors', 'id', 'BIGINT');
SELECT * FROM type_count('actors', 'login');
SELECT set_concrete_type('actors', 'login', 'TEXT');
SELECT * FROM type_count('actors', 'url');
SELECT set_concrete_type('actors', 'url', 'TEXT');
SELECT * FROM type_count('actors', 'gravatar_id');
SELECT DISTINCT gravatar_id FROM actors;
ALTER TABLE actors DROP COLUMN gravatar_id;
If it's always empty, do we still need it?
ALTER TABLE actors DROP COLUMN payload;
The payload column is a little different: its shape depends on the event type.
SELECT * FROM key_count('events', 'payload');
SELECT type,
jsonb_object_keys(event),
COUNT(*)
FROM events
GROUP BY type, jsonb_object_keys(event);
CREATE TABLE pushes (
id SERIAL PRIMARY KEY,
event_id INTEGER NOT NULL
REFERENCES events(id),
payload JSONB NOT NULL);
INSERT INTO pushes (event_id, payload)
SELECT id, payload FROM events WHERE type = 'PushEvent';
Note that the foreign key points the other way!
SELECT * FROM key_count('pushes', 'payload');
SELECT explode_json_column('pushes', 'payload');
SELECT set_concrete_type('pushes', 'before', 'TEXT');
SELECT set_concrete_type('pushes', 'distinct_size', 'INTEGER');
SELECT set_concrete_type('pushes', 'head', 'TEXT');
SELECT set_concrete_type('pushes', 'push_id', 'BIGINT');
SELECT set_concrete_type('pushes', 'ref', 'TEXT');
SELECT set_concrete_type('pushes', 'size', 'INTEGER');
SELECT * FROM type_count('pushes', 'commits');
SELECT commits->0 FROM pushes WHERE id = 1;
SELECT jsonb_array_elements(commits) FROM pushes WHERE id = 1;
CREATE TABLE push_commits (
id SERIAL PRIMARY KEY,
push_id INTEGER NOT NULL
REFERENCES pushes(id),
commit JSONB NOT NULL
);
INSERT INTO push_commits (push_id, commit)
SELECT id, jsonb_array_elements(commits) FROM pushes;
CREATE TABLE commits (
  id SERIAL PRIMARY KEY,
  payload JSONB NOT NULL
);
INSERT INTO commits (payload)
SELECT DISTINCT commit FROM push_commits;
ALTER TABLE push_commits
ADD COLUMN commit_id INTEGER
REFERENCES commits (id);
CREATE UNIQUE INDEX ON commits (payload);
That can fail: a large payload won't fit in a btree index entry, so index a hash of it instead:
CREATE UNIQUE INDEX ON commits (MD5(payload::TEXT));
CREATE INDEX ON push_commits (MD5(commit::TEXT));
We don't need a unique index on push_commits, because the same commit can be in multiple pushes.
UPDATE push_commits
SET commit_id =
  (SELECT id FROM commits
    WHERE MD5(commit::TEXT) = MD5(payload::TEXT));
ALTER TABLE push_commits
  ALTER COLUMN commit_id SET NOT NULL,
  DROP COLUMN commit,
  DROP COLUMN id,
  ADD PRIMARY KEY (push_id, commit_id);
SELECT * FROM type_count('commits', 'payload');
SELECT * FROM key_count('commits', 'payload');
SELECT explode_json_column('commits', 'payload');
SELECT set_concrete_type('commits', 'distinct', 'BOOLEAN');
SELECT set_concrete_type('commits', 'message', 'TEXT');
SELECT set_concrete_type('commits', 'sha', 'TEXT');
SELECT set_concrete_type('commits', 'url', 'TEXT');
We could extract the authors table here.
SELECT * FROM type_count('commits', 'author');
SELECT * FROM key_count('commits', 'author');
But we won't, as it doesn't cover anything new.
ALTER TABLE commits DROP COLUMN payload;
ALTER TABLE pushes DROP COLUMN payload, DROP COLUMN commits;
UPDATE events SET payload = NULL WHERE type = 'PushEvent';
SELECT nspname || '.' || relname AS "relation",
pg_size_pretty(pg_relation_size(C.oid)) AS "size"
FROM pg_class C
LEFT JOIN pg_namespace N
ON (N.oid = C.relnamespace)
WHERE nspname NOT IN ('pg_catalog', 'information_schema')
ORDER BY pg_relation_size(C.oid) DESC;
This is a lot of data.
And that's just for an hour!
https://bigquery.cloud.google.com/table/githubarchive:day.yesterday
@jasonaowen
https://jasonaowen.github.io/pdxpug-incremental-schema
https://github.com/jasonaowen/pdxpug-incremental-schema