ariansajina / master-thesis


Collect references as .bib #7

Open ariansajina opened 3 years ago

ariansajina commented 3 years ago

https://arxiv.org/abs/1607.05368

why 300 dimensions for word2vec? read here

@article{DBLP:journals/corr/LauB16,
  author        = {Jey Han Lau and Timothy Baldwin},
  title         = {An Empirical Evaluation of doc2vec with Practical Insights into Document Embedding Generation},
  journal       = {CoRR},
  volume        = {abs/1607.05368},
  year          = {2016},
  url           = {http://arxiv.org/abs/1607.05368},
  archivePrefix = {arXiv},
  eprint        = {1607.05368},
  timestamp     = {Mon, 13 Aug 2018 16:48:35 +0200},
  biburl        = {https://dblp.org/rec/journals/corr/LauB16.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}
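For concreteness, a minimal sketch of where that dimensionality would plug in, assuming gensim is the implementation we end up using (`tokenized_corpus` is a placeholder, and `vector_size` is the gensim 4.x name of the parameter; older versions call it `size`):

```python
from gensim.models import Word2Vec

# Placeholder: a list of tokenized sentences/documents from the thesis corpus.
tokenized_corpus = [["example", "sentence"], ["another", "one"]]

# 300-dimensional embeddings, the choice discussed in Lau & Baldwin (2016).
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=300,  # the dimensionality in question
    window=5,
    min_count=1,
)
```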

ariansajina commented 3 years ago

@article{scikit-learn,
  title   = {Scikit-learn: Machine Learning in {P}ython},
  author  = {Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  journal = {Journal of Machine Learning Research},
  volume  = {12},
  pages   = {2825--2830},
  year    = {2011}
}

ariansajina commented 3 years ago

@inproceedings{nothman-etal-2018-stop,
  title     = "Stop Word Lists in Free Open-source Software Packages",
  author    = "Nothman, Joel and Qin, Hanmin and Yurchak, Roman",
  booktitle = "Proceedings of Workshop for {NLP} Open Source Software ({NLP}-{OSS})",
  month     = jul,
  year      = "2018",
  address   = "Melbourne, Australia",
  publisher = "Association for Computational Linguistics",
  url       = "https://www.aclweb.org/anthology/W18-2502",
  doi       = "10.18653/v1/W18-2502",
  pages     = "7--12",
  abstract  = "Open-source software packages for language processing often include stop word lists. Users may apply them without awareness of their surprising omissions (e.g. {``}hasn{'}t{''} but not {``}hadn{'}t{''}) and inclusions ({``}computer{''}), or their incompatibility with a particular tokenizer. Motivated by issues raised about the Scikit-learn stop list, we investigate variation among and consistency within 52 popular English-language stop lists, and propose strategies for mitigating these issues.",
}

ariansajina commented 3 years ago

see #10

ariansajina commented 3 years ago

central paper on word2vec (method, not implementation)

@misc{mikolov2013distributed,
  title         = {Distributed Representations of Words and Phrases and their Compositionality},
  author        = {Tomas Mikolov and Ilya Sutskever and Kai Chen and Greg Corrado and Jeffrey Dean},
  year          = {2013},
  eprint        = {1310.4546},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

ariansajina commented 3 years ago

Note that there is no guarantee that we find the best possible arrangement of words, since word embeddings start randomly and are changed piecemeal. Because of this stochastic property, it is recommended to train several embedding models on the same data, and then average the resulting embedding vectors for each word over all of the models.

@article{10.1162/tacl_a_00008,
  author   = {Antoniak, Maria and Mimno, David},
  title    = "{Evaluating the Stability of Embedding-based Word Similarities}",
  journal  = {Transactions of the Association for Computational Linguistics},
  volume   = {6},
  pages    = {107-119},
  year     = {2018},
  month    = {02},
  abstract = "{Word embeddings are increasingly being used as a tool to study word associations in specific corpora. However, it is unclear whether such embeddings reflect enduring properties of language or if they are sensitive to inconsequential variations in the source documents. We find that nearest-neighbor distances are highly sensitive to small changes in the training corpus for a variety of algorithms. For all methods, including specific documents in the training set can result in substantial variations. We show that these effects are more prominent for smaller training corpora. We recommend that users never rely on single embedding models for distance calculations, but rather average over multiple bootstrap samples, especially for small corpora.}",
  issn     = {2307-387X},
  doi      = {10.1162/tacl_a_00008},
  url      = {https://doi.org/10.1162/tacl\_a\_00008},
  eprint   = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00008/1567586/tacl\_a\_00008.pdf},
}
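Following that recommendation, a rough sketch of how the averaging could look, assuming gensim (the corpus and word pair are placeholders). Here the pairwise similarities are averaged across models rather than the raw vectors, since independently trained embedding spaces are not aligned; that is one way to follow the "never rely on single embedding models for distance calculations" advice in the abstract above:

```python
import numpy as np
from gensim.models import Word2Vec

# Placeholder corpus: in the thesis this would be the preprocessed, tokenized documents.
tokenized_corpus = [
    ["word", "embeddings", "start", "randomly"],
    ["average", "over", "several", "models"],
] * 50

# Train several models that differ only in their random seed.
# workers=1 keeps each run reproducible for a given seed.
models = [
    Word2Vec(sentences=tokenized_corpus, vector_size=300, min_count=1, seed=s, workers=1)
    for s in range(10)
]

# Average the cosine similarity of a word pair across models instead of
# relying on a single (unstable) embedding space.
def averaged_similarity(w1, w2):
    return float(np.mean([m.wv.similarity(w1, w2) for m in models]))

print(averaged_similarity("word", "models"))
```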

ariansajina commented 3 years ago

doc2vec original paper

@InProceedings{pmlr-v32-le14,
  title     = {Distributed Representations of Sentences and Documents},
  author    = {Quoc Le and Tomas Mikolov},
  booktitle = {Proceedings of the 31st International Conference on Machine Learning},
  pages     = {1188--1196},
  year      = {2014},
  editor    = {Eric P. Xing and Tony Jebara},
  volume    = {32},
  number    = {2},
  series    = {Proceedings of Machine Learning Research},
  address   = {Bejing, China},
  month     = {22--24 Jun},
  publisher = {PMLR},
  pdf       = {http://proceedings.mlr.press/v32/le14.pdf},
  url       = {http://proceedings.mlr.press/v32/le14.html},
  abstract  = {Many machine learning algorithms require the input to be represented as a fixed length feature vector. When it comes to texts, one of the most common representations is bag-of-words. Despite their popularity, bag-of-words models have two major weaknesses: they lose the ordering of the words and they also ignore semantics of the words. For example, "powerful," "strong" and "Paris" are equally distant. In this paper, we propose an unsupervised algorithm that learns vector representations of sentences and text documents. This algorithm represents each document by a dense vector which is trained to predict words in the document. Its construction gives our algorithm the potential to overcome the weaknesses of bag-of-words models. Empirical results show that our technique outperforms bag-of-words models as well as other techniques for text representations. Finally, we achieve new state-of-the-art results on several text classification and sentiment analysis tasks.}
}
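For later reference, a minimal sketch of the paragraph-vector idea in gensim, assuming gensim's Doc2Vec is what we use (documents and hyperparameters are placeholders): each document gets a dense vector trained to predict its words, and vectors for unseen documents are obtained by inference.

```python
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Placeholder corpus: tokenized documents, each tagged with an id.
documents = [
    TaggedDocument(words=["bag", "of", "words", "loses", "order"], tags=[0]),
    TaggedDocument(words=["dense", "document", "vectors", "predict", "words"], tags=[1]),
]

# dm=1 selects the PV-DM variant described by Le & Mikolov (2014).
model = Doc2Vec(documents=documents, vector_size=300, dm=1, min_count=1, epochs=40)

# Trained vector for document 0, and an inferred vector for an unseen document.
trained_vec = model.dv[0]
new_vec = model.infer_vector(["a", "previously", "unseen", "document"])
```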