Closed fabriziosalmi closed 4 days ago
yes I planned to go thru pre and post filter, relying on simple external regex files, that way will be easy to block harmful content or just format stuff in a proper way while needed :)
hand on over this of course :)
I think that there are lots of options for rss filtering out there doing this internally really isn't super necessary imo i doubt anything particularly harmful can come out.
initial attempt to move this out of the main processor script..
To move the cleaned_content
filters to filters.py
, you can define a new function in filters.py
that handles the cleaning of the content. Here's how you can do this:
filters.py
to include the new filtering function:# filters.py
import re
def ensure_proper_punctuation(text):
    """Ensure every sentence in *text* ends with terminal punctuation.

    Splits *text* into sentences on whitespace following '.' or '?'
    (with lookbehinds to avoid splitting on abbreviations), strips each
    sentence, and appends a period to any sentence that lacks terminal
    punctuation.

    Args:
        text: The input text to normalize.

    Returns:
        The text with each sentence guaranteed to end in '.', '!' or '?'.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    corrected_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        # Fix: only append a period when the sentence has NO terminal
        # punctuation. The previous check (`not sentence.endswith('.')`)
        # appended '.' even after '?' or '!', producing e.g. "Really?.".
        if sentence and not sentence.endswith(('.', '!', '?')):
            sentence += '.'
        corrected_sentences.append(sentence)
    return ' '.join(corrected_sentences)
def clean_content(content):
    """Normalize LLM-rewritten text before it is saved.

    Strips markdown bold markers, collapses blank-line runs into a single
    space, drops source-citation lines ("Fonti:" / "Fonte:"), and finally
    ensures each sentence ends with proper punctuation.

    Args:
        content: Raw text returned by the LLM.

    Returns:
        The cleaned text.
    """
    substitutions = (
        (r'\*\*', '', 0),                  # remove markdown bold markers
        (r'\n\n+', ' ', 0),                # collapse paragraph breaks into one space
        (r'Fonti:.*$', '', re.MULTILINE),  # drop plural citation lines
        (r'Fonte:.*$', '', re.MULTILINE),  # drop singular citation lines
    )
    for pattern, replacement, flags in substitutions:
        content = re.sub(pattern, replacement, content, flags=flags)
    # Ensure proper punctuation at the end of sentences.
    return ensure_proper_punctuation(content)
clean_content
function:import re
import json
import requests
import logging
import argparse
import yaml
from pathlib import Path
from datetime import datetime
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from openai import OpenAI
from filters import ensure_proper_punctuation, clean_content # Import the functions
# Set up logging
# Module-wide root-logger configuration: INFO level, timestamped messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def requests_retry_session(retries=3, backoff_factor=0.3, status_forcelist=(500, 502, 504), session=None):
    """Build (or augment) a requests.Session that retries transient failures.

    Args:
        retries: Maximum retry count for total/read/connect failures.
        backoff_factor: Exponential backoff multiplier between attempts.
        status_forcelist: HTTP status codes that trigger a retry.
        session: Optional existing session to configure; a new one is
            created when not supplied.

    Returns:
        The configured requests.Session.
    """
    session = session or requests.Session()
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retry_adapter = HTTPAdapter(max_retries=retry_policy)
    # Install the retrying adapter for both plain and TLS transports.
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retry_adapter)
    return session
def call_openai_api(api_url, combined_content, model, api_key):
    """Send *combined_content* to OpenAI's chat-completions API.

    NOTE(review): *api_url* is accepted for signature parity with the
    Ollama caller but is not passed to the client, which uses its own
    default endpoint — confirm whether a custom base URL is intended.

    Args:
        api_url: Unused; kept for interface symmetry.
        combined_content: The user-role prompt content.
        model: OpenAI model identifier.
        api_key: OpenAI API key.

    Returns:
        The assistant reply text, or None if the request failed.
    """
    client = OpenAI(api_key=api_key)
    chat_messages = [
        {"role": "system", "content": "You are a professional assistant, skilled in composing detailed and accurate news articles from multiple sources."},
        {"role": "user", "content": combined_content},
    ]
    try:
        response = client.chat.completions.create(model=model, messages=chat_messages)
        return response.choices[0].message.content
    except Exception as exc:
        logging.error(f"OpenAI API request failed: {exc}")
        return None
def call_ollama_api(api_url, combined_content, model):
    """POST *combined_content* to an Ollama chat endpoint and return the reply.

    Args:
        api_url: Full URL of the Ollama chat endpoint.
        combined_content: The user-role prompt content.
        model: Ollama model identifier.

    Returns:
        The reply text, or None on any request/parse/shape failure.
    """
    data = json.dumps({
        "model": model,
        "messages": [{"role": "user", "content": combined_content}],
        "stream": False
    })
    try:
        response = requests_retry_session().post(api_url, data=data, headers={'Content-Type': 'application/json'})
        response.raise_for_status()
        try:
            response_json = response.json()
            logging.debug(f"Ollama API response: {response_json}")
            # Fix: guard against well-formed JSON that lacks the expected
            # {"message": {"content": ...}} shape — the previous direct
            # indexing raised an uncaught KeyError on error payloads.
            content = response_json.get('message', {}).get('content')
            if content is None:
                logging.error(f"Unexpected Ollama API response structure: {response_json}")
            return content
        except json.JSONDecodeError as e:
            logging.error(f"Failed to parse JSON response from Ollama API: {e}")
            logging.error(f"Response content: {response.text}")
            return None
    except requests.RequestException as e:
        logging.error(f"Ollama API request failed: {e}")
        return None
def process_json_file(filepath, api_url, model, api_key, content_prefix, rewritten_folder, api_type):
    """Rewrite one aggregated JSON file via the configured LLM backend.

    Reads a list of item dicts from *filepath*, concatenates their
    contents behind *content_prefix*, sends the result to the OpenAI or
    Ollama API, cleans the reply, and writes a ``*_rewritten.json`` file
    into *rewritten_folder*.

    Args:
        filepath: Path to the input JSON file (expected: a list of dicts).
        api_url: Endpoint URL for the selected backend.
        model: Model identifier to use.
        api_key: API key (OpenAI only; None for Ollama).
        content_prefix: Text prepended to the combined source contents.
        rewritten_folder: Destination folder for the rewritten file.
        api_type: Either "openai" or "ollama".
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            json_data = json.load(file)
    except (json.JSONDecodeError, IOError) as e:
        logging.error(f"Error reading JSON from {filepath}: {e}")
        return

    # Fix: an empty or non-list payload previously crashed below on
    # json_data[0]; skip such files with a logged error instead.
    if not isinstance(json_data, list) or not json_data:
        logging.error(f"No usable items found in {filepath}; skipping.")
        return

    # Fix: items without a 'content' key previously raised KeyError.
    combined_content = content_prefix + "\n".join(
        f"[source {idx + 1}] {item.get('content', '')}" for idx, item in enumerate(json_data))

    logging.info(f"Processing {filepath} - combined content prepared.")
    logging.debug(f"Combined content: {combined_content}")

    if api_type == "openai":
        rewritten_content = call_openai_api(api_url, combined_content, model, api_key)
    else:
        rewritten_content = call_ollama_api(api_url, combined_content, model)

    if rewritten_content:
        cleaned_content = clean_content(rewritten_content)  # Use the new clean_content function
        links = [item.get('link') for item in json_data if 'link' in item]
        current_datetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        new_data = {
            'title': json_data[0].get('title', 'No Title'),
            'content': cleaned_content,
            'processed_at': current_datetime,
            'links': links,
            'api': api_type,
            'model': model
        }

        new_filename = Path(rewritten_folder) / (Path(filepath).stem + '_rewritten.json')
        try:
            with open(new_filename, 'w', encoding='utf-8') as outfile:
                json.dump(new_data, outfile, ensure_ascii=False, indent=4)
            logging.info(f"Rewritten file saved to {new_filename}")
        except IOError as e:
            logging.error(f"Error writing to {new_filename}: {e}")
    else:
        logging.error("Failed to get rewritten content from LLM API.")
        logging.debug(f"Rewritten content: {rewritten_content}")
def validate_config(api_config):
    """Check that exactly one LLM backend is fully configured.

    Args:
        api_config: The ``api_config`` mapping loaded from config.yaml.

    Raises:
        ValueError: when both or neither backend is configured, or when
            the configured backend is missing required settings.
    """
    has_openai = 'openai_api_url' in api_config
    has_ollama = 'ollama_api_url' in api_config
    if has_openai and has_ollama:
        raise ValueError("Both OpenAI and Ollama API configurations are set. Please configure only one.")
    if not (has_openai or has_ollama):
        raise ValueError("Neither OpenAI nor Ollama API configuration is set. Please configure one.")
    if has_openai and not ('openai_api_key' in api_config and 'openai_model' in api_config):
        raise ValueError("OpenAI API configuration is incomplete. Please set the API URL, API key, and model.")
    if has_ollama and 'ollama_model' not in api_config:
        raise ValueError("Ollama API configuration is incomplete. Please set the API URL and model.")
def main():
    """Entry point: load config, select the LLM backend, process all JSON files."""
    parser = argparse.ArgumentParser(description='Process JSON files and call LLM API.')
    parser.add_argument('--config', type=str, help='Path to the configuration file.', default='config.yaml')
    args = parser.parse_args()

    # argparse already defaults to 'config.yaml'; the `or` is a belt-and-braces guard.
    config_path = args.config or 'config.yaml'
    with open(config_path, 'r') as config_file:
        config = yaml.safe_load(config_file)

    api_config = config['api_config']
    folders = config['folders']
    content_prefix = config['content_prefix']
    validate_config(api_config)

    rewritten_folder = Path(folders['rewritten_folder'])
    rewritten_folder.mkdir(parents=True, exist_ok=True)

    json_files = list(Path(folders['output_folder']).glob('*.json'))
    if not json_files:
        logging.info("No JSON files found in the output folder.")
        return

    # validate_config guarantees exactly one backend is configured.
    if 'openai_api_url' in api_config:
        api_url = api_config['openai_api_url']
        model = api_config['openai_model']
        api_key = api_config['openai_api_key']
        api_type = 'openai'
    else:
        api_url = api_config['ollama_api_url']
        model = api_config['ollama_model']
        api_key = None
        api_type = 'ollama'

    for filepath in json_files:
        logging.info(f"Processing file: {filepath}")
        process_json_file(filepath, api_url, model, api_key, content_prefix, rewritten_folder, api_type)


if __name__ == "__main__":
    main()
In this updated script, the clean_content
function is now part of filters.py
and handles the cleaning of the rewritten content. The main script imports and uses this function to process the content before saving it to a new file. This modular approach enhances code readability and maintainability.
@Leopere
I think that there are lots of options for rss filtering out there doing this internally really isn't super necessary imo i doubt anything particularly harmful can come out.
incoming stuff can be easily filtered with 3rd party tools like RSSHub or many others for general content.. outgoing stuff can be useful to have some quick solutions out of the box.. let's see what's goin' on :)
are you suggesting users code python to setup filters? That seems a bit convoluted. Like personally I came here kinda hoping I would be able to point this thing at some feeds and have certain feeds be processed by certain system prompts that I would set. I think that hardcoding stuff in is kind of overly opinionated. Perhaps I misunderstood based on your first post publicising this.
for the incoming stuff (rss items aggregation) i prefer atm to leave out the feature for more robust solutions like RSSHub.
for the outgoing stuff (rewritten items) i started to move logic out of llm_processor.py, to filters.py but of course the intent is to gradually move everything in the config.yaml
i know sometimes i have a clear objective for some portion of the app, but i love to make changes as flawlessly as i can — this is the reason for the “bridge” to filters.py in the first instance :)
last but not least i deliberately delayed some features (like dockerizing the ugly) until the code is more close to my goals :)
I mean with docker you don’t even need a config.yml
yes I know just pass as run params will be fine (input file path and others options).. just delayed atm but u are free to propose a version of course :)
dockerized and updated a bit, still playing with filter improvements :)
Filters can be done by anything else including UglyFeed itself if this thing can publish it's own feeds. Just have the output go through something else that filters. Focus on this being excellent.