souzatharsis / podcastfy

An Open Source Python alternative to NotebookLM's podcast feature: Transforming Multimodal Content into Captivating Multilingual Audio Conversations with GenAI
https://www.podcastfy.ai
Apache License 2.0

Add Evals #145

Open souzatharsis opened 3 weeks ago

souzatharsis commented 3 weeks ago

"evals are surprisingly often all you need"

But here we are evaluating a pretty novel dimension:

How can we systematically quantify whether generated text/audio is engaging and follows a target configuration, while delivering accurate and informative content?

This is perhaps the most challenging yet most pressing issue to solve.
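
As a rough illustration of the "follows a target configuration" part, a sketch like the one below could compare a generated transcript against the requested conversation settings. The word_count and speakers keys are just assumptions for the example, not podcastfy's actual config schema.

import re

def check_config_adherence(transcript: str, config: dict) -> dict:
    """Rough config-adherence checks; config keys here are illustrative only."""
    turns = re.findall(r'<(Person\d)>(.*?)</\1>', transcript, re.DOTALL)
    word_count = sum(len(text.split()) for _, text in turns)
    target_words = config.get('word_count', 2000)  # assumed key name
    return {
        # Within +/-20% of the requested length
        'length_ok': abs(word_count - target_words) / target_words <= 0.2,
        # Every configured speaker actually appears in the transcript
        'speakers_ok': {s for s, _ in turns} == set(config.get('speakers', ['Person1', 'Person2'])),
        'word_count': word_count,
    }

# e.g. check_config_adherence(transcript, {'word_count': 500, 'speakers': ['Person1', 'Person2']})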

souzatharsis commented 3 weeks ago
import numpy as np
from typing import List, Dict, Any
from rouge_score import rouge_scorer
from spacy import load
from bert_score import score
from scipy.stats import entropy
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from nltk.tokenize import sent_tokenize
import re

class AutomaticMetrics:
    """
    Automatic evaluation metrics for generated podcasts based on:
    - ROUGE [Lin, 2004]
    - Entity Coverage [Xu et al., 2023]
    - Dialogue Balance [Krishna et al., 2023]
    - Citation Verification [Gao et al., 2023]
    """
    def __init__(self):
        # Initialize NLP components
        self.nlp = load("en_core_web_sm")
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
        self.toxicity_model = AutoModelForSequenceClassification.from_pretrained(
            "martin-ha/toxic-comment-model"
        )
        self.toxicity_tokenizer = AutoTokenizer.from_pretrained(
            "martin-ha/toxic-comment-model"
        )

    def evaluate_content_coverage(
        self, 
        source_text: str, 
        generated_text: str
    ) -> Dict[str, float]:
        """
        Evaluate content coverage using ROUGE scores and entity overlap.
        Based on: "A Critical Evaluation of Evaluations for Long-form QA" 
        [Xu et al., 2023]
        """
        # Calculate ROUGE scores
        rouge_scores = self.rouge_scorer.score(source_text, generated_text)

        # Extract entities
        source_entities = set(self._extract_entities(source_text))
        generated_entities = set(self._extract_entities(generated_text))

        # Calculate entity coverage
        entity_recall = len(generated_entities.intersection(source_entities)) / len(source_entities) if source_entities else 0

        return {
            'rouge1': rouge_scores['rouge1'].fmeasure,
            'rouge2': rouge_scores['rouge2'].fmeasure,
            'rougeL': rouge_scores['rougeL'].fmeasure,
            'entity_recall': entity_recall
        }

    def analyze_dialogue_structure(
        self, 
        transcript: str
    ) -> Dict[str, float]:
        """
        Analyze dialogue balance and structure.
        Based on: "LongEval: Guidelines for Human Evaluation of Faithfulness 
        in Long-form Summarization" [Krishna et al., 2023]
        """
        # Split into speaker turns
        turns = self._split_into_turns(transcript)

        # Calculate turn statistics
        speaker_counts = {}
        turn_lengths = []

        for turn in turns:
            speaker = turn['speaker']
            text = turn['text']

            speaker_counts[speaker] = speaker_counts.get(speaker, 0) + 1
            turn_lengths.append(len(text.split()))

        # Calculate dialogue metrics
        total_turns = len(turns)
        turn_distribution = [count/total_turns for count in speaker_counts.values()]

        return {
            'speaker_entropy': entropy(turn_distribution),
            'avg_turn_length': np.mean(turn_lengths),
            'turn_length_variance': np.var(turn_lengths),
            'interaction_score': self._calculate_interaction_score(turns)
        }

    def verify_citations(
        self, 
        generated_text: str,
        sources: List[Dict[str, str]]
    ) -> Dict[str, float]:
        """
        Verify citation accuracy and coverage.
        Based on: "Enabling Large Language Models to Generate Text with Citations"
        [Gao et al., 2023]
        """
        # Extract cited statements
        citations = self._extract_citations(generated_text)

        # Verify each citation
        correct_citations = 0
        for citation in citations:
            statement = citation['statement']
            source_id = citation['source_id']
            source_text = next(
                (s['text'] for s in sources if s['id'] == source_id), 
                None
            )

            if source_text and self._verify_statement(statement, source_text):
                correct_citations += 1

        return {
            'citation_precision': correct_citations / len(citations) if citations else 0,
            'citation_density': len(citations) / len(sent_tokenize(generated_text))
        }

    def measure_engagement(
        self, 
        transcript: str
    ) -> Dict[str, float]:
        """
        Measure engagement through linguistic features.
        Based on: "What makes conversations engaging?" [Zhang et al., 2023]
        """
        # Analyze question frequency
        questions = len(re.findall(r'\?', transcript))

        # Calculate sentiment variance
        sentiments = self._analyze_sentiment_by_turn(transcript)

        # Check linguistic diversity
        vocab_richness = len(set(transcript.split())) / len(transcript.split())

        return {
            'question_density': questions / len(sent_tokenize(transcript)),
            'sentiment_variance': np.var(sentiments),
            'vocabulary_richness': vocab_richness,
            'toxicity_score': self._measure_toxicity(transcript)
        }

    def _extract_entities(self, text: str) -> List[str]:
        """Extract named entities using spaCy."""
        doc = self.nlp(text)
        return [ent.text for ent in doc.ents]

    def _split_into_turns(self, transcript: str) -> List[Dict[str, str]]:
        """Split transcript into speaker turns."""
        turns = []
        pattern = r'<(Person\d)>(.*?)</\1>'
        matches = re.finditer(pattern, transcript, re.DOTALL)

        for match in matches:
            speaker, text = match.groups()
            turns.append({
                'speaker': speaker,
                'text': text.strip()
            })

        return turns

    def _extract_citations(self, generated_text: str) -> List[Dict[str, str]]:
        """Extract cited statements and their source ids.

        Assumes inline numeric markers like "... some claim [1]."; adjust the
        pattern to whatever citation format the generated transcript uses.
        """
        citations = []
        for sentence in sent_tokenize(generated_text):
            for source_id in re.findall(r'\[(\d+)\]', sentence):
                citations.append({
                    'statement': re.sub(r'\[\d+\]', '', sentence).strip(),
                    'source_id': source_id
                })
        return citations

    def _calculate_interaction_score(self, turns: List[Dict[str, str]]) -> float:
        """Calculate interaction score based on turn-taking patterns."""
        question_pattern = r'\?'
        interaction_pairs = 0

        for i in range(len(turns)-1):
            current_turn = turns[i]['text']
            next_turn = turns[i+1]['text']

            # Check if current turn has question and next turn responds
            if re.search(question_pattern, current_turn) and next_turn.strip():
                interaction_pairs += 1

        return interaction_pairs / len(turns) if turns else 0

    def _verify_statement(self, statement: str, source: str) -> bool:
        """Verify if statement is supported by source using BERTScore."""
        precision, recall, f1 = score(
            [statement], 
            [source], 
            lang='en'
        )
        return f1.mean().item() > 0.6  # Threshold based on empirical testing

    def _analyze_sentiment_by_turn(self, transcript: str) -> List[float]:
        """Analyze sentiment for each turn."""
        turns = self._split_into_turns(transcript)
        sentiments = []

        for turn in turns:
            # Use toxicity model for sentiment approximation
            inputs = self.toxicity_tokenizer(
                turn['text'], 
                return_tensors="pt", 
                truncation=True
            )
            outputs = self.toxicity_model(**inputs)
            # The model emits one logit per class; take the probability of the
            # toxic class (assumed to be the last label) as a per-turn intensity proxy
            sentiment = torch.softmax(outputs.logits, dim=-1)[0, -1].item()
            sentiments.append(sentiment)

        return sentiments

    def _measure_toxicity(self, text: str) -> float:
        """Measure overall toxicity of the text."""
        inputs = self.toxicity_tokenizer(
            text, 
            return_tensors="pt", 
            truncation=True
        )
        outputs = self.toxicity_model(**inputs)
        # Softmax over class logits; assumes the last label is the toxic class
        return torch.softmax(outputs.logits, dim=-1)[0, -1].item()

# Example usage
def main():
    evaluator = AutomaticMetrics()

    # Example transcript
    transcript = """
    <Person1>Can you explain how neural networks learn?</Person1>
    <Person2>Neural networks learn through a process called backpropagation...</Person2>
    <Person1>That's interesting! How does backpropagation work exactly?</Person1>
    <Person2>Think of backpropagation as a way for the network to adjust its mistakes...</Person2>
    """

    # Example sources
    sources = [{
        'id': '1',
        'text': 'Neural networks learn through backpropagation, adjusting weights based on errors.'
    }]

    # Run evaluations
    coverage_scores = evaluator.evaluate_content_coverage(
        sources[0]['text'], 
        transcript
    )
    dialogue_scores = evaluator.analyze_dialogue_structure(transcript)
    citation_scores = evaluator.verify_citations(transcript, sources)
    engagement_scores = evaluator.measure_engagement(transcript)

    print("Evaluation Results:")
    print("Coverage:", coverage_scores)
    print("Dialogue:", dialogue_scores)
    print("Citations:", citation_scores)
    print("Engagement:", engagement_scores)

if __name__ == "__main__":
    main()
souzatharsis commented 3 weeks ago
from typing import List, Dict, Any
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import re
from nltk.tokenize import sent_tokenize
from textblob import TextBlob
from collections import Counter

class EngagementMetrics:
    """
    Extended metrics for measuring conversation engagement based on:
    Zhang et al. "Unveiling the Secrets of Engaging Conversations"
    """
    def __init__(self):
        self.emotion_model = AutoModelForSequenceClassification.from_pretrained(
            "j-hartmann/emotion-english-distilroberta-base"
        )
        self.emotion_tokenizer = AutoTokenizer.from_pretrained(
            "j-hartmann/emotion-english-distilroberta-base"
        )

    def evaluate_engagement(self, transcript: str) -> Dict[str, float]:
        """Calculate comprehensive engagement metrics."""
        turns = self._split_into_turns(transcript)

        return {
            'turn_dynamics': self._measure_turn_dynamics(turns),
            'emotional_resonance': self._measure_emotional_resonance(turns),
            'content_depth': self._measure_content_depth(turns),
            'interactive_flow': self._measure_interactive_flow(turns)
        }

    def _measure_turn_dynamics(self, turns: List[Dict]) -> Dict[str, float]:
        """
        Measure turn-taking dynamics including:
        - Response latency patterns
        - Turn length variations
        - Speaker balance
        """
        turn_lengths = [len(turn['text'].split()) for turn in turns]
        speaker_counts = Counter(turn['speaker'] for turn in turns)

        return {
            'turn_length_mean': np.mean(turn_lengths),
            'turn_length_variance': np.var(turn_lengths),
            'speaker_balance': min(speaker_counts.values()) / max(speaker_counts.values()),
            'consistent_engagement': self._calculate_engagement_consistency(turn_lengths)
        }

    def _measure_emotional_resonance(self, turns: List[Dict]) -> Dict[str, float]:
        """
        Analyze emotional aspects including:
        - Emotion transitions
        - Sentiment synchronization
        - Emotional intensity
        """
        emotions = []
        for turn in turns:
            inputs = self.emotion_tokenizer(
                turn['text'],
                return_tensors="pt",
                truncation=True
            )
            outputs = self.emotion_model(**inputs)
            emotion_probs = torch.softmax(outputs.logits, dim=1)
            emotions.append(emotion_probs.detach().numpy())

        return {
            'emotion_coherence': self._calculate_emotion_coherence(emotions),
            'sentiment_alignment': self._calculate_sentiment_alignment(turns),
            'emotional_range': self._calculate_emotional_range(emotions)
        }

    def _measure_content_depth(self, turns: List[Dict]) -> Dict[str, float]:
        """
        Evaluate content quality including:
        - Topic development
        - Information density
        - Vocabulary sophistication
        """
        all_text = " ".join(turn['text'] for turn in turns)
        sentences = sent_tokenize(all_text)

        return {
            'vocab_diversity': len(set(all_text.split())) / len(all_text.split()),
            'topic_coherence': self._calculate_topic_coherence(turns),
            'information_density': self._calculate_information_density(sentences)
        }

    def _measure_interactive_flow(self, turns: List[Dict]) -> Dict[str, float]:
        """
        Analyze interaction patterns including:
        - Question-answer dynamics
        - Follow-up behaviors
        - Engagement signals
        """
        return {
            'question_answer_ratio': self._calculate_qa_ratio(turns),
            'followup_rate': self._calculate_followup_rate(turns),
            'engagement_signals': self._detect_engagement_signals(turns)
        }

    def _calculate_engagement_consistency(self, turn_lengths: List[int]) -> float:
        """Measure consistency of engagement through turn lengths."""
        if not turn_lengths:
            return 0.0
        # Calculate moving average of turn lengths
        window_size = 3
        moving_avgs = []
        for i in range(len(turn_lengths) - window_size + 1):
            window = turn_lengths[i:i + window_size]
            moving_avgs.append(np.mean(window))
        # Calculate variance of moving averages
        return 1 / (1 + np.var(moving_avgs))

    def _calculate_emotion_coherence(self, emotions: List[np.ndarray]) -> float:
        """Calculate emotional coherence between turns."""
        if len(emotions) < 2:
            return 0.0
        coherence_scores = []
        for i in range(len(emotions) - 1):
            similarity = np.dot(emotions[i][0], emotions[i+1][0])
            coherence_scores.append(similarity)
        return np.mean(coherence_scores)

    def _calculate_sentiment_alignment(self, turns: List[Dict]) -> float:
        """Measure how closely consecutive turns track each other's sentiment.

        Uses TextBlob polarity as a lightweight proxy; returns a value in [0, 1]
        where 1 means adjacent turns carry identical sentiment.
        """
        if len(turns) < 2:
            return 0.0
        polarities = [TextBlob(turn['text']).sentiment.polarity for turn in turns]
        # Polarity lies in [-1, 1], so adjacent differences are at most 2
        diffs = [abs(polarities[i] - polarities[i + 1]) for i in range(len(polarities) - 1)]
        return 1 - np.mean(diffs) / 2

    def _calculate_emotional_range(self, emotions: List[np.ndarray]) -> float:
        """Fraction of the emotion classes that appear as a turn's dominant emotion."""
        if not emotions:
            return 0.0
        dominant = [int(np.argmax(e[0])) for e in emotions]
        return len(set(dominant)) / emotions[0][0].shape[0]

    def _calculate_topic_coherence(self, turns: List[Dict]) -> float:
        """Measure topic development and maintenance."""
        # Simplified topic coherence using lexical overlap
        words_by_turn = [set(turn['text'].lower().split()) for turn in turns]
        coherence_scores = []
        for i in range(len(words_by_turn) - 1):
            overlap = len(words_by_turn[i] & words_by_turn[i+1])
            union = len(words_by_turn[i] | words_by_turn[i+1])
            coherence_scores.append(overlap / union if union > 0 else 0)
        return np.mean(coherence_scores) if coherence_scores else 0.0

    def _calculate_information_density(self, sentences: List[str]) -> float:
        """Calculate information density using sentence complexity."""
        # Using sentence length and unique words as proxy for information density
        densities = []
        for sent in sentences:
            words = sent.split()
            unique_ratio = len(set(words)) / len(words) if words else 0
            densities.append(len(words) * unique_ratio)
        return np.mean(densities)

    def _calculate_qa_ratio(self, turns: List[Dict]) -> float:
        """Calculate ratio of questions to answers."""
        questions = sum(1 for turn in turns if '?' in turn['text'])
        return questions / len(turns) if turns else 0

    def _calculate_followup_rate(self, turns: List[Dict]) -> float:
        """Fraction of questions that receive a substantive reply in the next turn."""
        questions = 0
        followups = 0
        for i in range(len(turns) - 1):
            if '?' in turns[i]['text']:
                questions += 1
                # Treat a reply of at least five words as a genuine follow-up
                if len(turns[i + 1]['text'].split()) >= 5:
                    followups += 1
        return followups / questions if questions else 0.0

    def _detect_engagement_signals(self, turns: List[Dict]) -> float:
        """Detect explicit engagement signals in conversation."""
        engagement_markers = [
            r'\b(interesting|fascinating|tell me more|i see|wow|really|oh)\b',
            r'[!?]{2,}',
            r'\b(agree|exactly|absolutely|definitely)\b'
        ]
        total_signals = 0
        for turn in turns:
            for marker in engagement_markers:
                total_signals += len(re.findall(marker, turn['text'].lower()))
        return total_signals / len(turns) if turns else 0

    def _split_into_turns(self, transcript: str) -> List[Dict[str, str]]:
        """Split transcript into turns."""
        turns = []
        pattern = r'<(Person\d)>(.*?)</\1>'
        matches = re.finditer(pattern, transcript, re.DOTALL)
        for match in matches:
            speaker, text = match.groups()
            turns.append({
                'speaker': speaker,
                'text': text.strip()
            })
        return turns

# Example usage
def main():
    evaluator = EngagementMetrics()

    transcript = """
    <Person1>What's your take on artificial intelligence?</Person1>
    <Person2>AI is fascinating! It's revolutionizing how we solve complex problems.</Person2>
    <Person1>That's interesting! Could you give an example?</Person1>
    <Person2>Absolutely! Take medical diagnosis for instance. AI can now detect patterns in medical images with remarkable accuracy!</Person2>
    """

    results = evaluator.evaluate_engagement(transcript)
    print("Engagement Metrics:")
    for category, scores in results.items():
        print(f"\n{category.replace('_', ' ').title()}:")
        for metric, value in scores.items():
            print(f"  {metric}: {value:.3f}")

if __name__ == "__main__":
    main()
souzatharsis commented 2 weeks ago

LLM-as-a-judge sounds like an interesting, flexible and clean solution for this

https://huggingface.co/learn/cookbook/en/llm_judge
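
A minimal sketch of what that could look like for podcast transcripts is below. The rubric and the OpenAI client/model are just assumptions to make the idea concrete; the HF cookbook does the same thing with an open model through transformers.

import json
from openai import OpenAI  # any chat-completion client would work here

JUDGE_PROMPT = """You are evaluating a generated podcast transcript.
Score each criterion from 1 (poor) to 5 (excellent) and return JSON only:
{{"engagement": 0, "coherence": 0, "faithfulness_to_source": 0, "rationale": ""}}

Source material:
{source}

Transcript:
{transcript}
"""

def llm_judge(source: str, transcript: str, model: str = "gpt-4o-mini") -> dict:
    """Ask an LLM to grade a transcript against a small, illustrative rubric."""
    client = OpenAI()
    response = client.chat.completions.create(
        model=model,
        temperature=0,  # keep the judge as deterministic as possible
        messages=[{"role": "user", "content": JUDGE_PROMPT.format(source=source, transcript=transcript)}],
    )
    return json.loads(response.choices[0].message.content)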

souzatharsis commented 2 weeks ago

Also, https://github.com/HLasse/TextDescriptives

for a metrics-based approach
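
A quick sketch of that route (assuming TextDescriptives' extract_metrics API and the en_core_web_sm spaCy model are installed):

import textdescriptives as td

# Readability and descriptive statistics for a generated transcript;
# returns a one-row pandas DataFrame per input text
metrics = td.extract_metrics(
    text="Neural networks learn through backpropagation, adjusting weights based on errors.",
    spacy_model="en_core_web_sm",
    metrics=["descriptive_stats", "readability"],
)
print(metrics.T)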