Open MJedr opened 1 year ago
Create a reusable ES loader. It should be able to connect to ES with credentials and load data:
This code snippet might be useful for creating an ES connection:
import os
from typing import List, Optional

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl.connections import connections


class LiteratureSearch(Search):
    """A ``Search`` that runs against a lazily created, class-wide ES connection.

    Credentials are read from the ``ES_USERNAME`` and ``ES_PASSWORD``
    environment variables once, when the class body is executed.
    """

    es_username = os.environ.get('ES_USERNAME')
    es_password = os.environ.get('ES_PASSWORD')
    # Cached connection shared by every instance; created on first use.
    _connection: Optional[Elasticsearch] = None

    def __init__(self, index: str, **kwargs):
        """Create a search bound to *index*.

        Args:
            index: Name of the index to search.
            **kwargs: Extra keyword arguments forwarded to ``Search``.
                When ``using`` is not supplied, the shared connection
                from ``_get_connection`` is used.
        """
        # ``Search._clone()`` re-invokes the constructor with ``using`` and
        # ``doc_type``; only fill in ``using`` when it is absent so the call
        # never receives the keyword twice and the other arguments survive.
        if "using" not in kwargs:
            kwargs["using"] = self._get_connection()
        super().__init__(index=index, **kwargs)

    @classmethod
    def _get_connection(cls) -> Elasticsearch:
        """Return the shared connection, creating it on the first call."""
        if cls._connection is None:
            cls._connection = connections.create_connection(
                # NOTE(review): the host is an empty placeholder here and
                # must be filled in before this can connect.
                hosts=[''],
                http_auth=(cls.es_username, cls.es_password),
                use_ssl=True,
                # NOTE(review): certificate verification is disabled;
                # confirm this is intended outside of local testing.
                verify_certs=False,
                timeout=5,
            )
        return cls._connection
Using the above class, I collected the raw documents (1.71 GB) from records-hep. We can now proceed with filtering and create the dataset for the model.
1.71 GB
records-hep
Create a reusable ES loader. It should be able to connect to ES with credentials and load data:
This code snippet might be useful for creating an ES connection: