mithrandie / csvq

SQL-like query language for csv
https://mithrandie.github.io/csvq
MIT License
1.5k stars 65 forks source link

Out of memory error #56

Closed jjconti closed 3 years ago

jjconti commented 3 years ago

I was running csvq and got an out of memory error. Is there a way to avoid this making? This was the command:

./csvq -f csv -o santafe.csv 'select * fromdatos_nomivac_covid19.csvwhere jurisdiccion_residencia="Santa Fe"'

The file was 1.6 GB. I was using the las build https://github.com/mithrandie/csvq/releases/download/v1.15.1/csvq-v1.15.1-linux-386.tar.gz

Error trace:


runtime: out of memory: cannot allocate 4194304-byte block (3778379776 in use)
fatal error: out of memory

goroutine 33 [running]:
runtime.throw(0x84f84b4, 0xd)
    runtime/panic.go:1117 +0x6a fp=0xc15e46a0 sp=0xc15e468c pc=0x807a5ba
runtime.(*mcache).refill(0xf1867538, 0x402)
    runtime/mcache.go:164 +0x271 fp=0xc15e46c8 sp=0xc15e46a0 pc=0x805c4e1
runtime.(*mcache).nextFree(0xf1867538, 0x2, 0xffffffff, 0x7f, 0x0)
    runtime/malloc.go:882 +0x79 fp=0xc15e46e4 sp=0xc15e46c8 pc=0x8053089
runtime.mallocgc(0x8, 0x84af820, 0x80b0a01, 0x7)
    runtime/malloc.go:1069 +0x706 fp=0xc15e472c sp=0xc15e46e4 pc=0x8053926
runtime.newobject(0x84af820, 0x886019c)
    runtime/malloc.go:1177 +0x2c fp=0xc15e4740 sp=0xc15e472c pc=0x8053abc
github.com/mithrandie/csvq/lib/value.glob..func1(0x8860190, 0x2)
    github.com/mithrandie/csvq/lib/value/pool.go:9 +0x23 fp=0xc15e474c sp=0xc15e4740 pc=0x816ff63
sync.(*Pool).Get(0x8860190, 0xfc299ca8, 0xa)
    sync/pool.go:148 +0x85 fp=0xc15e476c sp=0xc15e474c pc=0x80b0815
github.com/mithrandie/csvq/lib/value.getString(...)
    github.com/mithrandie/csvq/lib/value/pool.go:32
github.com/mithrandie/csvq/lib/value.NewString(...)
    github.com/mithrandie/csvq/lib/value/type.go:51
github.com/mithrandie/csvq/lib/query.readRecordSet.func1(0xc15ba280, 0x660949a9, 0x0, 0xc158c790, 0xc159a1fc, 0xc159a210)
    github.com/mithrandie/csvq/lib/query/view.go:1082 +0x2d1 fp=0xc15e47d8 sp=0xc15e476c pc=0x83d1801
runtime.goexit()
    runtime/asm_386.s:1315 +0x1 fp=0xc15e47dc sp=0xc15e47d8 pc=0x80a91c1
created by github.com/mithrandie/csvq/lib/query.readRecordSet
    github.com/mithrandie/csvq/lib/query/view.go:1071 +0x129

goroutine 1 [semacquire, 1 minutes]:
sync.runtime_Semacquire(0xc159a218)
    runtime/sema.go:56 +0x36
sync.(*WaitGroup).Wait(0xc159a210)
    sync/waitgroup.go:130 +0x7c
github.com/mithrandie/csvq/lib/query.readRecordSet(0x85a531c, 0xc1427560, 0x85a1a24, 0xc15a0190, 0x660949a9, 0x0, 0x305, 0x305, 0xc142e8c8, 0x1, ...)
    github.com/mithrandie/csvq/lib/query/view.go:1129 +0x1af
github.com/mithrandie/csvq/lib/query.loadViewFromCSVFile(0x85a531c, 0xc1427560, 0x85a3f18, 0xc159c2d8, 0xc158e180, 0x8050000, 0x85a6f14, 0xc158c770, 0xc15b5d0c, 0xc158c770, ...)
    github.com/mithrandie/csvq/lib/query/view.go:960 +0x15f
github.com/mithrandie/csvq/lib/query.loadViewFromFile(0x85a531c, 0xc1427560, 0xc14a8320, 0x85a3f18, 0xc159c2d8, 0xc158e180, 0xffffffff, 0x2c, 0x0, 0x0, ...)
    github.com/mithrandie/csvq/lib/query/view.go:860 +0xec
github.com/mithrandie/csvq/lib/query.cacheViewFromFile(0x85a531c, 0xc1427560, 0xc15a00a0, 0x85a6f14, 0xc158c710, 0x0, 0xffffffff, 0x2c, 0x0, 0x0, ...)
    github.com/mithrandie/csvq/lib/query/view.go:834 +0x747
github.com/mithrandie/csvq/lib/query.loadObject(0x85a531c, 0xc1427560, 0xc15a00a0, 0x85a6f14, 0xc158c710, 0xc158c700, 0xc1584080, 0x15, 0x0, 0x0, ...)
    github.com/mithrandie/csvq/lib/query/view.go:665 +0x2c7
github.com/mithrandie/csvq/lib/query.loadView(0x85a531c, 0xc1427560, 0xc15a00a0, 0x85a7374, 0xc158e120, 0x0, 0x0, 0x0, 0x0)
    github.com/mithrandie/csvq/lib/query/view.go:337 +0xf4d
github.com/mithrandie/csvq/lib/query.LoadView(0x85a531c, 0xc1427560, 0xc15a00a0, 0xc159c258, 0x1, 0x1, 0x0, 0x0, 0x0, 0xffffffff)
    github.com/mithrandie/csvq/lib/query/view.go:117 +0x16d
github.com/mithrandie/csvq/lib/query.selectEntity(0x85a531c, 0xc1427560, 0xc15a00a0, 0x85a72ac, 0xc15ba0c0, 0x0, 0x0, 0x0, 0x0)
    github.com/mithrandie/csvq/lib/query/query.go:198 +0xe3
github.com/mithrandie/csvq/lib/query.Select(0x85a531c, 0xc1427560, 0xc148c460, 0x0, 0x0, 0x0, 0x85a72ac, 0xc15ba0c0, 0x0, 0x0, ...)
    github.com/mithrandie/csvq/lib/query/query.go:126 +0x2c8
github.com/mithrandie/csvq/lib/query.(*Processor).ExecuteStatement(0xc1416ae0, 0x85a531c, 0xc1427560, 0x84df7a0, 0xc15a0050, 0x8479740, 0x859ab10, 0x0)
    github.com/mithrandie/csvq/lib/query/processor.go:189 +0x8c8
github.com/mithrandie/csvq/lib/query.(*Processor).execute(0xc1416ae0, 0x85a531c, 0xc1427560, 0xc159c288, 0x1, 0x1, 0xffffffff, 0x0, 0x839e4bb)
    github.com/mithrandie/csvq/lib/query/processor.go:94 +0x60
github.com/mithrandie/csvq/lib/query.(*Processor).Execute(0xc1416ae0, 0x85a531c, 0xc1427560, 0xc159c288, 0x1, 0x1, 0x0, 0x1, 0x0)
    github.com/mithrandie/csvq/lib/query/processor.go:83 +0xd6
github.com/mithrandie/csvq/lib/action.Run(0x85a531c, 0xc1427560, 0xc1416ae0, 0xfc63e3db, 0x52, 0x0, 0x0, 0xc15840a0, 0x1e, 0x0, ...)
    github.com/mithrandie/csvq/lib/action/run.go:59 +0x26d
main.main.func5(0x85a531c, 0xc1427560, 0xc1482160, 0xc1416ae0, 0x0, 0x0)
    github.com/mithrandie/csvq/main.go:259 +0x112
main.commandAction.func1(0xc1482160, 0x0, 0x0)
    github.com/mithrandie/csvq/main.go:332 +0x24e
github.com/urfave/cli.HandleAction(0x84703e0, 0xc140e770, 0xc1482160, 0x0, 0x0)
    github.com/urfave/cli@v1.20.0/app.go:490 +0x63
github.com/urfave/cli.(*App).Run(0xc1401420, 0xc1416060, 0x6, 0x6, 0x0, 0x0)
    github.com/urfave/cli@v1.20.0/app.go:264 +0x51c
main.main()
    github.com/mithrandie/csvq/main.go:265 +0x1509

goroutine 7 [syscall, 1 minutes]:
os/signal.signal_recv(0x0)
    runtime/sigqueue.go:168 +0x132
os/signal.loop()
    os/signal/signal_unix.go:23 +0x1a
created by os/signal.Notify.func1.1
    os/signal/signal.go:151 +0x33

goroutine 17 [chan receive, 1 minutes]:
main.commandAction.func1.2(0xc1456240, 0xc1514000, 0xc140e8f0)
    github.com/mithrandie/csvq/main.go:317 +0x39
created by main.commandAction.func1
    github.com/mithrandie/csvq/main.go:316 +0x1c1

goroutine 34 [chan send]:
github.com/mithrandie/csvq/lib/query.readRecordSet.func2(0x85a531c, 0xc1427560, 0xc159c2e0, 0x85a1a24, 0xc15a0190, 0x660949a9, 0x0, 0xc159a1fc, 0xc15ba280, 0xc159a210)
    github.com/mithrandie/csvq/lib/query/view.go:1122 +0x69
created by github.com/mithrandie/csvq/lib/query.readRecordSet
    github.com/mithrandie/csvq/lib/query/view.go:1099 +0x1a3
mithrandie commented 3 years ago

Depending on the content of the data, csvq will require at least 10 times more memory allocation than the target file size to keep all the data in memory at runtime. Can you check if your system's RAM and virtual memory have enough space to handle the file? If not, it is probably difficult to solve the problem and you should look for another way than csvq.

The actual amount of memory required to handle the data can be estimated by cutting the data to a size that can be processed and then running the same query with the --stats option.

$ head -n 1000000 foo.csv > bar.csv
$ csvq --stats 'SELECT COUNT(*) FROM `bar.csv`'
+----------+
| COUNT(*) |
+----------+
|   999999 |
+----------+
Query Execution Time: 0.818470 seconds

       Resource Statistics
---------------------------------
  TotalTime:    0.818675 seconds
 TotalAlloc: 351,134,832 bytes
    HeapSys: 334,561,280 bytes
    Mallocs:  10,002,885 objects
      Frees:   2,769,768 objects

HeapSys is the amount of memory required on the system for the execution of the query.