darkpowerxo opened 3 months ago
I was thinking: what if we just git clone the latest version of the docs repo onto the user's system and then run the indexing against that local copy? This would solve many issues.
{
  "name": "github.com/repo",
  "type": "github", // "local" or "github" ("github" indicates clone)
  "docsPath": "github.com/repo/docs-folder"
}
cc: @Patrick-Erichsen
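For illustration, here is a minimal sketch of the clone step under that config, shelling out to the `git` CLI; `clone_docs_repo` and the shallow-clone flags are my own choices, not anything Continue ships today:

use std::io;
use std::path::Path;
use std::process::Command;

/// Hypothetical helper: shallow-clone a docs repo so the indexer can read it locally.
fn clone_docs_repo(repo_url: &str, dest: &Path) -> io::Result<()> {
    // --depth 1 keeps the clone small; indexing only needs the latest snapshot.
    let status = Command::new("git")
        .args(["clone", "--depth", "1", repo_url])
        .arg(dest)
        .status()?;
    if !status.success() {
        return Err(io::Error::new(
            io::ErrorKind::Other,
            format!("git clone exited with {status}"),
        ));
    }
    Ok(())
}

The indexer would then read files under `dest` joined with `docsPath` instead of crawling over HTTP.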
This wouldn't work if the docs aren't on GitHub, though. For now I found a complicated workaround: I use Rust with random sleep timers to crawl the pages and download them offline, then point continue.dev at my local copy.
If anyone is interested, here is the Rust code.
use scraper::{Html, Selector};
use std::error::Error;
use std::fmt;
use std::time::Duration;
use thirtyfour::prelude::*;
use reqwest::Client;
use reqwest::header::{CONTENT_TYPE, USER_AGENT};
// Only needed once the commented-out INSERT block below is re-enabled:
// use mysql::prelude::*;
// use mysql::Pool;

/// One row scraped from the economic-calendar table.
#[derive(Debug, PartialEq, Eq)]
struct Event {
    date: String,
    time: String,
    currency: String,
    impact: String,
    event: String,
    actual: String,
    forecast: String,
    previous: String,
}

impl Event {
    fn new(
        date: String,
        time: String,
        currency: String,
        impact: String,
        event: String,
        actual: String,
        forecast: String,
        previous: String,
    ) -> Self {
        Event { date, time, currency, impact, event, actual, forecast, previous }
    }
}

#[derive(Debug)]
struct MyError(String);

impl fmt::Display for MyError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "There is an error: {}", self.0)
    }
}

impl Error for MyError {}

/// Parse the calendar rows out of the page source and (optionally) insert them into MySQL.
fn mysql_insert(body: &str) -> Result<(), Box<dyn Error>> {
    let document = Html::parse_document(body);
    let event_selector = Selector::parse("tr[id*=eventRowId]").unwrap();
    let mut events = Vec::new();

    for event_element in document.select(&event_selector) {
        // Rows without an `event_timestamp` attribute are spacers, not events.
        if let Some(timestamp) = event_element.value().attr("event_timestamp") {
            let date = timestamp.split(' ').next().unwrap().to_string();
            let time = event_element.select(&Selector::parse(".time").unwrap()).next().unwrap().text().collect::<String>();
            let currency = event_element.select(&Selector::parse(".flagCur").unwrap()).next().unwrap().text().collect::<String>();
            let sentiment_element = event_element.select(&Selector::parse(".sentiment").unwrap()).next().unwrap();
            let impact = sentiment_element.value().attr("title").unwrap_or("").to_string();
            let event = event_element.select(&Selector::parse(".event").unwrap()).next().unwrap().text().collect::<String>();
            let actual = event_element.select(&Selector::parse(".act").unwrap()).next().unwrap().text().collect::<String>();
            let forecast = event_element.select(&Selector::parse(".fore").unwrap()).next().unwrap().text().collect::<String>();
            let previous = event_element.select(&Selector::parse(".prev").unwrap()).next().unwrap().text().collect::<String>();

            // Skip rows whose `previous` cell contains whitespace (an empty placeholder),
            // and keep only rows with a recognized impact level.
            if !previous.contains(char::is_whitespace)
                && (impact.contains("High") || impact.contains("Moderate") || impact.contains("Low"))
            {
                println!("{:?}", impact);
                events.push(Event::new(date, time, currency, impact, event, actual, forecast, previous));
            }
        }
    }

    // let pool = Pool::new("mysql://<MY MYSQL INFO>:3306/economic_calendar")?;
    // let mut conn = pool.get_conn()?;
    // for event in events {
    //     conn.exec_drop(
    //         r"INSERT INTO economic_calendar (date, time, currency, impact, event, actual, forecast, previous)
    //           VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
    //         (event.date, event.time, event.currency, event.impact, event.event, event.actual, event.forecast, event.previous),
    //     )?;
    // }

    Ok(())
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    // Connect to a locally running chromedriver (start it with e.g. `chromedriver --port=9515`).
    let caps = DesiredCapabilities::chrome();
    let driver = WebDriver::new("http://localhost:9515", caps).await?;

    // Give slow pages up to 10 seconds to finish loading before `goto` returns.
    driver.set_page_load_timeout(Duration::from_secs(10)).await?;

    // Navigate to the page.
    driver.goto("https://sslecal2.investing.com?columns=exc_flags,exc_currency,exc_importance,exc_actual,exc_forecast,exc_previous&category=_employment,_economicActivity,_inflation,_credit,_centralBanks,_confidenceIndex,_balance,_Bonds&features=datepicker,timezone,timeselector,filters&countries=29,25,34,32,6,232,27,37,122,15,107,55,24,72,71,22,17,39,14,10,35,103,7,43,38,100,56,52,36,110,11,26,9,12,63,61,143,4,5&calType=week&timeZone=8&lang=1").await?;

    // Extract the rendered page source from the browser and use it as the response body.
    let body = driver.source().await?;

    // The ajax endpoint accepts a date range, e.g. dateFrom=2023-04-03&dateTo=...
    let response = Client::new()
        .post("https://sslecal2.investing.com/ajax.php")
        .header(USER_AGENT, "Mozilla/5.0")
        .header(CONTENT_TYPE, "application/x-www-form-urlencoded")
        .body("dateFrom=2022-11-06&dateTo=2022-11-12")
        .send()
        .await?;
    println!("Response: {:?}", response);

    // println!("{:?}", mysql_insert(&body));

    driver.quit().await?;
    Ok(())
}
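The snippet above doesn't show the random sleep timers I mentioned; a minimal sketch of that part, assuming the `rand` crate alongside tokio (the 2-7 second range is my own guess, tune it per site):

use rand::Rng;
use std::time::Duration;

/// Hypothetical helper: pause a random 2-7 seconds between page fetches
/// so the crawl doesn't look like a burst to anti-DDoS protection.
async fn polite_pause() {
    let millis: u64 = rand::thread_rng().gen_range(2_000..7_000);
    tokio::time::sleep(Duration::from_millis(millis)).await;
}

Call it between successive `driver.goto(...)` calls when crawling more than one page.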
My issue isn't with this particular site; I just used it as an example.
Description
We need more options to slow down the crawler and to limit how many pages it scans at the same time. Currently, when I add a site via @docs => Add a documentation site, the crawler fetches pages as fast as possible; many docs sites have anti-DDoS protection, so they block me, and afterwards I can't even visit the site in a browser anymore. A sketch of what I mean follows.
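To make the request concrete, here is a rough sketch of the kind of throttling I have in mind, assuming tokio and reqwest; `max_concurrent` and `delay` are hypothetical knobs, not existing Continue settings:

use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Semaphore;

/// Hypothetical throttled crawl: at most `max_concurrent` pages in flight,
/// with a fixed delay before each fetch.
async fn crawl_throttled(urls: Vec<String>, max_concurrent: usize, delay: Duration) {
    let permits = Arc::new(Semaphore::new(max_concurrent));
    let mut handles = Vec::new();
    for url in urls {
        let permits = Arc::clone(&permits);
        handles.push(tokio::spawn(async move {
            // Holding the permit for the whole fetch bounds concurrency.
            let _permit = permits.acquire_owned().await.unwrap();
            tokio::time::sleep(delay).await;
            match reqwest::get(&url).await {
                Ok(resp) => println!("fetched {} ({})", url, resp.status()),
                Err(err) => eprintln!("failed {}: {}", url, err),
            }
        }));
    }
    for handle in handles {
        let _ = handle.await;
    }
}

Even just exposing those two numbers in the docs-site config would be enough to stay under the rate limits.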
To reproduce
Add a documentation site via @docs; indexing eventually fails with a Failed to index "site url" notification.