Open munmun319 opened 8 months ago
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.bash import BashOperator
import os
# Resolve the helper scripts relative to this DAG file, so the BashOperator
# commands do not depend on the working directory of the process that runs them.
# `__file__` is the path of this module.
absolute_path = os.path.abspath(__file__)
scripts_dir = os.path.dirname(absolute_path)
scraper_script = os.path.join(scripts_dir, 'scraper_to_csv.py')
mongo_script = os.path.join(scripts_dir, 'csv_to_mongoDB.py')
# Arguments passed to the DAG as `default_args`.
default_args = {
    'depends_on_past': False,
    # Address is recorded, but both e-mail notification flags below are off.
    'email': ['sangmun_kim@sfu.ca'],
    'email_on_failure': False,
    'email_on_retry': False,
    # One retry, with a five-minute delay.
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}
# Daily ETL pipeline: scrape the data to CSV, then load the CSV into MongoDB.
with DAG(
    'nhl_reddit_ETL',
    default_args=default_args,
    description='A simple DAG to scrape NHL data and save to MongoDB',
    schedule=timedelta(days=1),
    start_date=datetime(2024, 3, 28),
    # Do not backfill runs for the dates between start_date and today.
    catchup=False,
) as dag:
    # Task 1: run the scraper script, which is expected to produce the CSV.
    t1 = BashOperator(
        task_id='scraper_to_csv',
        bash_command=f"python3 {scraper_script}",
    )

    # Task 2: run the loader script, which is expected to push the CSV to MongoDB.
    t2 = BashOperator(
        task_id='csv_to_mongoDB',
        bash_command=f"python3 {mongo_script}",
    )

    # The load step must only start after scraping has succeeded.
    t1 >> t2
### Result
![image](https://github.com/BettTer/Hockey_Salary_Research/assets/72590773/a07408f7-6c1c-43b8-8968-f28c3fad8687)
- Created a DAG (Directed Acyclic Graph) of two tasks (scraping and loading to the DB) and scheduled it to run daily.
### Future plan
- Add a sentiment analysis task after the current two tasks, so that the whole sequential process runs within the pipeline.
Objective
Implement sentiment analysis to add another dimension to player evaluation, and to find correlations between the sentiment score, the actual salary, and the predicted salary.
Goal