yaslab / CSV.swift

CSV reading and writing library written in Swift.
MIT License
661 stars 78 forks source link

Poor Read and Parsing Performance #119

Open xanderdunn opened 4 years ago

xanderdunn commented 4 years ago

I am reading billions of rows of data from gzipped CSV files exported from Google Cloud's BigQuery. Here is a Swift script using CSV.swift to read a gzipped CSV file with 12 columns and 848,563 rows:

import Foundation

import CSV
import Gzip

// CSV.swift benchmark:
// - ~1m5s when collecting the raw fields into [[String]]
// - ~5m15s when decoding each row into a Decodable type
func testReadBigQueryCSV() {
    let path = "data/my-data-export-2020-09-20-000000000161.csv.gz"
    // Load and decompress the whole export up front so only
    // CSV parsing is being timed.
    let compressed: Data = try! Data(contentsOf: URL(fileURLWithPath: path))
    let csvData: Data = try! compressed.gunzipped()

    let reader = try! CSVReader(stream: InputStream(data: csvData), hasHeaderRow: true)
    print(reader.headerRow!)

    let decoder = CSVRowDecoder()
    decoder.userInfo[.knownFormatKey] = KnownFormat.BigQueryCSV

    // Fast(er) variant: accumulate raw string fields only.
    var rows: [[String]] = []
    for fields in reader {
        rows.append(fields)
    }
    // Slow variant (5m15s): decode every row into a typed value.
    /*var rows: [MyDecodableType] = []*/
    /*while reader.next() != nil {*/
        /*let record: MyDecodableType = try! decoder.decode(MyDecodableType.self, from: reader)*/
        /*rows.append(record)*/
    /*}*/
    print("Got \(rows.count) rows")
}

When simply reading the CSV fields into a [[String]], it takes 1m5s. When decoding the Strings into typed objects, it takes 5m15s.

For comparison, here is a Python script that reads and parses the same file in 3.6 seconds:

#!/usr/bin/env python3

import pandas as pd

# pandas baseline on the same gzipped export:
# - 3.6 seconds without parsing the date columns
# - 2m43s when the date columns are parsed as well
def main():
    file_path = "data/my-data-export-2020-09-20-000000000161.csv.gz"
    print("Reading {}".format(file_path))
    # Date-parsing variant (2m43s):
    # frame = pd.read_csv(file_path, compression='gzip',
    #                     parse_dates=["time", "timeUpdateReceived", "inserted"])
    frame = pd.read_csv(file_path, compression='gzip')
    print("Read {} rows".format(frame.shape))
    print(frame.columns)
    print(frame.dtypes)
    print(frame.iloc[0])


if __name__ == "__main__":
    main()

The comparison is 3.6s vs 1m5s for the plain read (about 18x slower) and 2m43s vs 5m15s when decoding/parsing dates (about 1.9x slower). Note that pandas achieves this while using only a single CPU core.