Status: Open · TeorinKim opened 3 years ago
# Build a term-document count matrix in SciPy's Compressed Sparse Row (CSR)
# format incrementally: one row per document, one column per distinct term.
from scipy.sparse import csr_matrix

docs = [
    ["python", "is", "a", "programming", "language"],
    ["programming", "is", "fun"],
    ["python", "is", "easy"],
]

# The three parallel CSR arrays, grown as the documents are scanned:
#   indptr  - row i owns the entries in the slice indptr[i]:indptr[i + 1]
#   indices - column (vocabulary) index of each stored entry
#   data    - value of each stored entry (always 1 here, one per occurrence)
indptr = [0]
indices = []
data = []
vocabulary = {}  # term -> column index, assigned in order of first appearance

for d in docs:
    for term in d:
        # setdefault returns the term's existing column, or registers the
        # next free column (the current vocabulary size) for a new term.
        index = vocabulary.setdefault(term, len(vocabulary))
        indices.append(index)
        data.append(1)
    # Close the row: record where this document's entries end.
    indptr.append(len(indices))

for k, v in vocabulary.items():
    print(k, ':', v)

# The shape is not passed explicitly; it is inferred from the index arrays.
term_document_csr_mat = csr_matrix((data, indices, indptr), dtype=int)
# Bare expression: shows the value when it ends a notebook cell; it has no
# effect when this runs as a plain script.
term_document_csr_mat

print(term_document_csr_mat)

print('-- SciPy Compressed Sparse Row matrix --')
print('indptr:', term_document_csr_mat.indptr)
print('indices:', term_document_csr_mat.indices)
print('data:', term_document_csr_mat.data)

term_document_arr = term_document_csr_mat.toarray()  # or todense()
term_document_arr
To construct a CSR matrix incrementally:
# Incremental construction of a term-document matrix in SciPy CSR format:
# each document becomes a row, each distinct term a column.
from scipy.sparse import csr_matrix

docs = [
    ["python", "is", "a", "programming", "language"],
    ["programming", "is", "fun"],
    ["python", "is", "easy"],
]

# CSR arrays built up while scanning the documents:
#   indptr  - entries of row i live in the slice indptr[i]:indptr[i + 1]
#   indices - column (vocabulary) index for each stored entry
#   data    - stored value for each entry (1 per term occurrence)
indptr = [0]
indices = []
data = []
vocabulary = {}  # maps each term to its column, in first-seen order

for d in docs:
    for term in d:
        # Reuse the column already assigned to this term, or allocate the
        # next one (equal to the number of terms seen so far).
        index = vocabulary.setdefault(term, len(vocabulary))
        indices.append(index)
        data.append(1)
    # One indptr entry per document marks the end of that row's entries.
    indptr.append(len(indices))

for k, v in vocabulary.items():
    print(k, ':', v)

# No explicit shape is given; csr_matrix infers it from the index arrays.
term_document_csr_mat = csr_matrix((data, indices, indptr), dtype=int)
# Bare expression kept for notebook use (displays the value at the end of a
# cell); it does nothing in a plain script.
term_document_csr_mat

print(term_document_csr_mat)

print('-- SciPy Compressed Sparse Row matrix --')
print('indptr:', term_document_csr_mat.indptr)
print('indices:', term_document_csr_mat.indices)
print('data:', term_document_csr_mat.data)

term_document_arr = term_document_csr_mat.toarray()  # or todense()
term_document_arr