Open f-hafner opened 1 year ago
I did some simple tests with the following query:
-- For each sampled author, keep the affiliation(s) with the most
-- journal/conference papers in each year (ties are all retained).
WITH journal_conf_papers AS (
    -- papers of the relevant document types only
    SELECT PaperId, Year
    FROM Papers
    WHERE DocType IN ('Journal', 'Conference')
),
author_year_counts AS (
    -- paper count per author x affiliation x year, restricted to sampled authors
    SELECT
        paa.AuthorId,
        paa.AffiliationId,
        p.Year,
        COUNT(paa.PaperId) AS PaperCount
    FROM PaperAuthorAffiliations AS paa
    INNER JOIN author_sample AS s
        ON s.AuthorId = paa.AuthorId
    INNER JOIN journal_conf_papers AS p
        ON p.PaperId = paa.PaperId
    WHERE paa.AffiliationId != ''
    GROUP BY paa.AuthorId, paa.AffiliationId, p.Year
),
ranked AS (
    SELECT
        *,
        MAX(PaperCount) OVER (PARTITION BY AuthorId, Year) AS MaxPaperCount
    FROM author_year_counts
)
SELECT AuthorId, AffiliationId, Year
FROM ranked
WHERE PaperCount = MaxPaperCount
When run through sqlite, this takes 700s; when run through duckdb, it takes 77s.
The syntax for duckdb is:
/* Read every sqlite column as VARCHAR (works around '' stored in INT columns,
   but means all output columns come back as CHAR — see open questions below). */
SET GLOBAL sqlite_all_varchar = true;
/* NOTE(review): on a fresh duckdb install, `install sqlite;` is presumably
   needed before this `load` — confirm. */
load sqlite;
/* attach the sqlite file and make it the default database */
attach 'path/to/db/AcademicGraph.sqlite' (type sqlite);
use AcademicGraph;
show tables;
/* then execute the query above */
Open questions
The `sqlite_all_varchar = true` statement makes the query return all output columns as CHAR. We should aim to avoid this statement; for that it may be necessary to change some data — for instance, replace `""` with NULL in INT columns. In Python, this works: https://gist.github.com/f-hafner/f49ee843b61d395adf1afc532df8e5db — it took 46s (it did not matter whether a temp table was created or not).
How about storing the entire data in duckdb instead? With a small-ish example, having a duckdb file with the following schema:
/* Target duckdb schema: id/year columns stored as proper integers,
   so the sqlite_all_varchar workaround is no longer needed. */
CREATE TABLE PaperAuthorAffiliations(PaperId BIGINT, AuthorId BIGINT, AffiliationId BIGINT);
CREATE TABLE Papers(PaperId BIGINT, DocType VARCHAR, "Year" BIGINT);
CREATE TABLE author_sample(AuthorId BIGINT, YearLastPub INTEGER, YearFirstPub INTEGER, PaperCount INTEGER, FirstName VARCHAR);
(this schema was created with the following interacting queries, run on the sqlite database and on the duckdb database):
/*make table author_sample */
/* -------------------------------------------------------------------------*/
/* in sqlite */
/* Staging copy of author_sample with declared column types, so duckdb's
   sqlite scanner can pick up proper types instead of all-VARCHAR. */
create table flavio_test (AuthorId INT, YearLastPub INT, YearFirstPub INT, PaperCount INT, FirstName VARCHAR);
-- explicit column lists: do not rely on SELECT * matching the column order
insert into flavio_test (AuthorId, YearLastPub, YearFirstPub, PaperCount, FirstName)
select AuthorId, YearLastPub, YearFirstPub, PaperCount, FirstName
from author_sample;
/* in duckdb */
/* I think the install and load of sqlite is missing here */
/* duckdb: copy the staging table into a typed table.
   Use single quotes for string literals — double quotes denote
   identifiers in standard SQL and in duckdb. */
create table author_sample(AuthorId bigint, YearLastPub INT, YearFirstPub INT, PaperCount INT, FirstName varchar);
insert into author_sample (AuthorId, YearLastPub, YearFirstPub, PaperCount, FirstName)
select AuthorId, YearLastPub, YearFirstPub, PaperCount, FirstName
from sqlite_scan('/mnt/ssd/AcademicGraph/AcademicGraph.sqlite', 'flavio_test');
/* make table Papers*/
/* -------------------------------------------------------------------------*/
/* in sqlite */
drop table if exists flavio_test;
/* Papers staging table: keep only rows with a usable DocType and Year.
   Single quotes for string literals — in sqlite, `""` is only accepted
   as a string via a legacy misfeature (double quotes mean identifiers). */
create table flavio_test(PaperId INT, DocType TEXT, Year INT);
insert into flavio_test (PaperId, DocType, Year)
select PaperId, DocType, Year
from Papers
where DocType != '' and Year is not NULL and Year != '';
/* in duckdb */
install sqlite;
load sqlite;
/* single-quoted string literals (double quotes would be parsed as identifiers);
   table cased `Papers` to match the schema and the query above */
create table Papers as
select PaperId, DocType, Year
from sqlite_scan('/mnt/ssd/AcademicGraph/AcademicGraph.sqlite', 'flavio_test');
/* make table PaperAuthorAffiliations*/
/* -------------------------------------------------------------------------*/
/* in sqlite */
drop table if exists flavio_test;
/* PaperAuthorAffiliations staging table: drop rows without an affiliation. */
create table flavio_test(PaperId INT, AuthorId INT, AffiliationId INT);
-- single quotes: `""` as a string literal relies on a sqlite legacy quirk
insert into flavio_test (PaperId, AuthorId, AffiliationId)
select PaperId, AuthorId, AffiliationId
from PaperAuthorAffiliations
where AffiliationId != '';
/* in duckdb */
/* single-quoted string literals: double quotes are identifiers in duckdb */
create table PaperAuthorAffiliations as
select PaperId, AuthorId, AffiliationId
from sqlite_scan('/mnt/ssd/AcademicGraph/AcademicGraph.sqlite', 'flavio_test');
then the following query takes 11 seconds
-- Best affiliation(s) per author-year: the affiliation with the highest
-- journal/conference paper count for that author and year (ties all kept).
create temp table flavio_test as
WITH author_year_counts AS (
    -- paper count per author x affiliation x year, restricted to sampled authors
    SELECT
        paa.AuthorId,
        paa.AffiliationId,
        p.Year,
        COUNT(paa.PaperId) AS PaperCount
    FROM PaperAuthorAffiliations AS paa
    INNER JOIN author_sample AS s
        ON s.AuthorId = paa.AuthorId
    INNER JOIN (
        SELECT PaperId, Year
        FROM Papers
        WHERE DocType IN ('Journal', 'Conference')
    ) AS p
        ON p.PaperId = paa.PaperId
    GROUP BY paa.AuthorId, paa.AffiliationId, p.Year
)
SELECT AuthorId, AffiliationId, Year
FROM author_year_counts
-- QUALIFY (supported by duckdb) filters on the window result directly,
-- avoiding the two nested subqueries around MaxPaperCount
QUALIFY PaperCount = MAX(PaperCount) OVER (PARTITION BY AuthorId, Year);
I dropped the WHERE AffiliationId != ''
clause because it is already enforced through duckdb's type requirements (which implies the speed is not perfectly comparable to the speeds above)
Alternatively, analyze the sqlite database with duckdb directly instead of using pandas in-memory with batches. See here: https://motherduck.com/blog/analyze-sqlite-databases-duckdb/
Do some speed comparisons.