Hi Davis,
I was trying to build a total_word_feature_extractor from a 4.5 GB Russian text corpus (which isn't that large, IMO). Unfortunately, wordrep exits with a std::bad_alloc exception.
Why does it need so much memory - more than 24 GB?
7 0x00000000004f1f26 in dlib::memory_manager_stateless_kernel_1::allocate_array (this=0x7fffffffa288, size=6500000000) at /usr/local/include/dlib/memory_manager_stateless/memory_manager_stateless_kernel_1.h:53
8 0x00000000004e956b in dlib::row_major_layout::layout<float, 0l, 0l,
dlib::memory_manager_stateless_kernel_1, 5>::set_size (this=0x7fffffffa270, nr=50000000, nc=130) at /usr/local/include/dlib/matrix/matrix_data_layout.h:506
9 0x00000000004e88d5 in dlib::matrix<float, 0l, 0l, dlib::memory_manager_stateless_kernel_1, dlib::row_major_layout>::set_size (this=0x7fffffffa270, rows=50000000, cols=130) at /usr/local/include/dlib/matrix/matrix.h:1375
extra_rank=40, q=5, regularization=0) at /usr/local/include/dlib/statistics/cca.h:161
14 0x0000000000517064 in do_cca_on_windows<mitie::group_tokenizer > (vocab=std::map with 200000 elements = {...}, window_size=9, num_contexts=50000000, num_correlations=90, tok=..., Ltrans=..., Rtrans=...) at /home/max/Documents/devel/MITIE/tools/wordrep/src/word_vects.cpp:122
15 0x0000000000515840 in word_vects (parser=...) at /home/max/Documents/devel/MITIE/tools/wordrep/src/word_vects.cpp:226
Hi Davis, I was trying to build a total_word_feature_extractor from a 4.5 GB Russian text corpus (which isn't that large, IMO). Unfortunately, wordrep exits with a std::bad_alloc exception. Why does it need so much memory - more than 24 GB?
7 0x00000000004f1f26 in dlib::memory_manager_stateless_kernel_1::allocate_array (this=0x7fffffffa288, size=6500000000) at /usr/local/include/dlib/memory_manager_stateless/memory_manager_stateless_kernel_1.h:53
8 0x00000000004e956b in dlib::row_major_layout::layout<float, 0l, 0l,
dlib::memory_manager_stateless_kernel_1, 5>::set_size (this=0x7fffffffa270, nr=50000000, nc=130) at /usr/local/include/dlib/matrix/matrix_data_layout.h:506
9 0x00000000004e88d5 in dlib::matrix<float, 0l, 0l, dlib::memory_manager_stateless_kernel_1, dlib::row_major_layout>::set_size (this=0x7fffffffa270, rows=50000000, cols=130) at /usr/local/include/dlib/matrix/matrix.h:1375
10 0x000000000051fdcf in dlib::find_matrix_range<std::vector<std::pair<unsigned int, float>, std::allocator<std::pair<unsigned int, float> > >, float, dlib::memory_manager_stateless_kernel_1, dlib::row_major_layout> (A=std::vector of length 50000000, capacity 50000000 = {...}, l=130, Q=..., q=5)
11 0x000000000051c6ea in dlib::svd_fast<std::vector<std::pair<unsigned int, float>, std::allocator<std::pair<unsigned int, float> > >, float, 0l, 0l, 0l, 1l, 0l, 0l, dlib::memory_manager_stateless_kernel_1, dlib::row_major_layout> (A=std::vector of length 50000000, capacity 50000000 = {...}, u=..., w=...,
12 0x000000000051a0b1 in dlib::impl_cca<std::vector<std::vector<std::pair<unsigned int, float>, std::allocator<std::pair<unsigned int, float> > >, std::allocator<std::vector<std::pair<unsigned int, float>, std::allocator<std::pair<unsigned int, float> > > > >, float> (
13 0x00000000005184f3 in dlib::cca<std::vector<std::pair<unsigned int, float>, std::allocator<std::pair<unsigned int, float> > >, float> (L=std::vector of length 50000000, capacity 50000000 = {...}, R=std::vector of length 50000000, capacity 50000000 = {...}, Ltrans=..., Rtrans=..., num_correlations=90,
14 0x0000000000517064 in do_cca_on_windows<mitie::group_tokenizer > (vocab=std::map with 200000 elements = {...}, window_size=9, num_contexts=50000000, num_correlations=90, tok=..., Ltrans=..., Rtrans=...) at /home/max/Documents/devel/MITIE/tools/wordrep/src/word_vects.cpp:122
15 0x0000000000515840 in word_vects (parser=...) at /home/max/Documents/devel/MITIE/tools/wordrep/src/word_vects.cpp:226