souzatharsis opened 2 days ago
import numpy as np
from typing import List, Dict, Any
from rouge_score import rouge_scorer
from spacy import load
from bert_score import score
from scipy.stats import entropy
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from nltk.tokenize import sent_tokenize
import re
class AutomaticMetrics:
"""
Automatic evaluation metrics for generated podcasts based on:
- ROUGE [Lin, 2004]
- Entity Coverage [Xu et al., 2023]
- Dialogue Balance [Krishna et al., 2023]
- Citation Verification [Gao et al., 2023]
"""
def __init__(self):
# Initialize NLP components
self.nlp = load("en_core_web_sm")
self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
self.toxicity_model = AutoModelForSequenceClassification.from_pretrained(
"martin-ha/toxic-comment-model"
)
self.toxicity_tokenizer = AutoTokenizer.from_pretrained(
"martin-ha/toxic-comment-model"
)
def evaluate_content_coverage(
self,
source_text: str,
generated_text: str
) -> Dict[str, float]:
"""
Evaluate content coverage using ROUGE scores and entity overlap.
Based on: "A Critical Evaluation of Evaluations for Long-form QA"
[Xu et al., 2023]
"""
# Calculate ROUGE scores
rouge_scores = self.rouge_scorer.score(source_text, generated_text)
# Extract entities
source_entities = set(self._extract_entities(source_text))
generated_entities = set(self._extract_entities(generated_text))
# Calculate entity coverage
        entity_recall = (
            len(generated_entities & source_entities) / len(source_entities)
            if source_entities else 0.0
        )
return {
'rouge1': rouge_scores['rouge1'].fmeasure,
'rouge2': rouge_scores['rouge2'].fmeasure,
'rougeL': rouge_scores['rougeL'].fmeasure,
'entity_recall': entity_recall
}
def analyze_dialogue_structure(
self,
transcript: str
) -> Dict[str, float]:
"""
Analyze dialogue balance and structure.
Based on: "LongEval: Guidelines for Human Evaluation of Faithfulness
in Long-form Summarization" [Krishna et al., 2023]
"""
# Split into speaker turns
turns = self._split_into_turns(transcript)
# Calculate turn statistics
speaker_counts = {}
turn_lengths = []
for turn in turns:
speaker = turn['speaker']
text = turn['text']
speaker_counts[speaker] = speaker_counts.get(speaker, 0) + 1
turn_lengths.append(len(text.split()))
        # Calculate dialogue metrics; guard against empty transcripts
        total_turns = len(turns)
        if total_turns == 0:
            return {
                'speaker_entropy': 0.0,
                'avg_turn_length': 0.0,
                'turn_length_variance': 0.0,
                'interaction_score': 0.0
            }
        turn_distribution = [count / total_turns for count in speaker_counts.values()]
return {
'speaker_entropy': entropy(turn_distribution),
'avg_turn_length': np.mean(turn_lengths),
'turn_length_variance': np.var(turn_lengths),
'interaction_score': self._calculate_interaction_score(turns)
}
def verify_citations(
self,
generated_text: str,
sources: List[Dict[str, str]]
) -> Dict[str, float]:
"""
Verify citation accuracy and coverage.
Based on: "Enabling Large Language Models to Generate Text with Citations"
[Gao et al., 2023]
"""
# Extract cited statements
citations = self._extract_citations(generated_text)
# Verify each citation
correct_citations = 0
for citation in citations:
statement = citation['statement']
source_id = citation['source_id']
source_text = next(
(s['text'] for s in sources if s['id'] == source_id),
None
)
if source_text and self._verify_statement(statement, source_text):
correct_citations += 1
        sentences = sent_tokenize(generated_text)
        return {
            'citation_precision': correct_citations / len(citations) if citations else 0.0,
            'citation_density': len(citations) / len(sentences) if sentences else 0.0
        }
def measure_engagement(
self,
transcript: str
) -> Dict[str, float]:
"""
Measure engagement through linguistic features.
Based on: "What makes conversations engaging?" [Zhang et al., 2023]
"""
# Analyze question frequency
questions = len(re.findall(r'\?', transcript))
# Calculate sentiment variance
sentiments = self._analyze_sentiment_by_turn(transcript)
        # Check linguistic diversity (type-token ratio), guarding empty input
        words = transcript.split()
        vocab_richness = len(set(words)) / len(words) if words else 0.0
        sentences = sent_tokenize(transcript)
        return {
            'question_density': questions / len(sentences) if sentences else 0.0,
'sentiment_variance': np.var(sentiments),
'vocabulary_richness': vocab_richness,
'toxicity_score': self._measure_toxicity(transcript)
}
def _extract_entities(self, text: str) -> List[str]:
"""Extract named entities using spaCy."""
doc = self.nlp(text)
return [ent.text for ent in doc.ents]
def _split_into_turns(self, transcript: str) -> List[Dict[str, str]]:
"""Split transcript into speaker turns."""
turns = []
pattern = r'<(Person\d)>(.*?)</\1>'
matches = re.finditer(pattern, transcript, re.DOTALL)
for match in matches:
speaker, text = match.groups()
turns.append({
'speaker': speaker,
'text': text.strip()
})
        return turns
    def _extract_citations(self, text: str) -> List[Dict[str, str]]:
        """
        Extract cited statements. This helper is called in verify_citations but
        was missing; it assumes the simple convention 'statement [source_id]',
        e.g. '... through backpropagation [1]'.
        """
        citations = []
        pattern = r'([^.!?]+)\s*\[(\d+)\]'
        for match in re.finditer(pattern, text):
            statement, source_id = match.groups()
            citations.append({
                'statement': statement.strip(),
                'source_id': source_id
            })
        return citations
def _calculate_interaction_score(self, turns: List[Dict[str, str]]) -> float:
"""Calculate interaction score based on turn-taking patterns."""
question_pattern = r'\?'
interaction_pairs = 0
        for i in range(len(turns) - 1):
            current_turn = turns[i]['text']
            next_turn = turns[i + 1]['text']
            # Count a question that receives a non-empty response as an interaction pair
            if re.search(question_pattern, current_turn) and next_turn:
                interaction_pairs += 1
return interaction_pairs / len(turns) if turns else 0
def _verify_statement(self, statement: str, source: str) -> bool:
"""Verify if statement is supported by source using BERTScore."""
precision, recall, f1 = score(
[statement],
[source],
lang='en'
)
return f1.mean().item() > 0.6 # Threshold based on empirical testing
def _analyze_sentiment_by_turn(self, transcript: str) -> List[float]:
"""Analyze sentiment for each turn."""
turns = self._split_into_turns(transcript)
sentiments = []
for turn in turns:
# Use toxicity model for sentiment approximation
inputs = self.toxicity_tokenizer(
turn['text'],
return_tensors="pt",
truncation=True
)
            with torch.no_grad():
                outputs = self.toxicity_model(**inputs)
            # The model has two classes, so .item() on the raw logits would fail;
            # take the softmax probability of index 1 (assumed to be 'toxic';
            # check model.config.id2label to confirm)
            sentiment = torch.softmax(outputs.logits, dim=1)[0, 1].item()
            sentiments.append(sentiment)
return sentiments
def _measure_toxicity(self, text: str) -> float:
"""Measure overall toxicity of the text."""
inputs = self.toxicity_tokenizer(
text,
return_tensors="pt",
truncation=True
)
        with torch.no_grad():
            outputs = self.toxicity_model(**inputs)
        # As above: softmax over the two classes, probability of the assumed 'toxic' index
        return torch.softmax(outputs.logits, dim=1)[0, 1].item()
# Example usage
def main():
evaluator = AutomaticMetrics()
# Example transcript
transcript = """
<Person1>Can you explain how neural networks learn?</Person1>
<Person2>Neural networks learn through a process called backpropagation...</Person2>
<Person1>That's interesting! How does backpropagation work exactly?</Person1>
<Person2>Think of backpropagation as a way for the network to adjust its mistakes...</Person2>
"""
# Example sources
sources = [{
'id': '1',
'text': 'Neural networks learn through backpropagation, adjusting weights based on errors.'
}]
# Run evaluations
coverage_scores = evaluator.evaluate_content_coverage(
sources[0]['text'],
transcript
)
dialogue_scores = evaluator.analyze_dialogue_structure(transcript)
citation_scores = evaluator.verify_citations(transcript, sources)
engagement_scores = evaluator.measure_engagement(transcript)
print("Evaluation Results:")
print("Coverage:", coverage_scores)
print("Dialogue:", dialogue_scores)
print("Citations:", citation_scores)
print("Engagement:", engagement_scores)
if __name__ == "__main__":
main()
from typing import List, Dict, Any
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import re
from nltk.tokenize import sent_tokenize
from textblob import TextBlob
from collections import Counter
class EngagementMetrics:
"""
Extended metrics for measuring conversation engagement based on:
Zhang et al. "Unveiling the Secrets of Engaging Conversations"
"""
def __init__(self):
self.emotion_model = AutoModelForSequenceClassification.from_pretrained(
"j-hartmann/emotion-english-distilroberta-base"
)
self.emotion_tokenizer = AutoTokenizer.from_pretrained(
"j-hartmann/emotion-english-distilroberta-base"
)
def evaluate_engagement(self, transcript: str) -> Dict[str, float]:
"""Calculate comprehensive engagement metrics."""
turns = self._split_into_turns(transcript)
return {
'turn_dynamics': self._measure_turn_dynamics(turns),
'emotional_resonance': self._measure_emotional_resonance(turns),
'content_depth': self._measure_content_depth(turns),
'interactive_flow': self._measure_interactive_flow(turns)
}
def _measure_turn_dynamics(self, turns: List[Dict]) -> Dict[str, float]:
"""
Measure turn-taking dynamics including:
- Response latency patterns
- Turn length variations
- Speaker balance
"""
        if not turns:
            return {
                'turn_length_mean': 0.0,
                'turn_length_variance': 0.0,
                'speaker_balance': 0.0,
                'consistent_engagement': 0.0
            }
        turn_lengths = [len(turn['text'].split()) for turn in turns]
        speaker_counts = Counter(turn['speaker'] for turn in turns)
return {
'turn_length_mean': np.mean(turn_lengths),
'turn_length_variance': np.var(turn_lengths),
'speaker_balance': min(speaker_counts.values()) / max(speaker_counts.values()),
'consistent_engagement': self._calculate_engagement_consistency(turn_lengths)
}
def _measure_emotional_resonance(self, turns: List[Dict]) -> Dict[str, float]:
"""
Analyze emotional aspects including:
- Emotion transitions
- Sentiment synchronization
- Emotional intensity
"""
emotions = []
for turn in turns:
inputs = self.emotion_tokenizer(
turn['text'],
return_tensors="pt",
truncation=True
)
outputs = self.emotion_model(**inputs)
emotion_probs = torch.softmax(outputs.logits, dim=1)
emotions.append(emotion_probs.detach().numpy())
return {
'emotion_coherence': self._calculate_emotion_coherence(emotions),
'sentiment_alignment': self._calculate_sentiment_alignment(turns),
'emotional_range': self._calculate_emotional_range(emotions)
}
def _measure_content_depth(self, turns: List[Dict]) -> Dict[str, float]:
"""
Evaluate content quality including:
- Topic development
- Information density
- Vocabulary sophistication
"""
all_text = " ".join(turn['text'] for turn in turns)
sentences = sent_tokenize(all_text)
return {
'vocab_diversity': len(set(all_text.split())) / len(all_text.split()),
'topic_coherence': self._calculate_topic_coherence(turns),
'information_density': self._calculate_information_density(sentences)
}
def _measure_interactive_flow(self, turns: List[Dict]) -> Dict[str, float]:
"""
Analyze interaction patterns including:
- Question-answer dynamics
- Follow-up behaviors
- Engagement signals
"""
return {
'question_answer_ratio': self._calculate_qa_ratio(turns),
'followup_rate': self._calculate_followup_rate(turns),
'engagement_signals': self._detect_engagement_signals(turns)
}
def _calculate_engagement_consistency(self, turn_lengths: List[int]) -> float:
"""Measure consistency of engagement through turn lengths."""
if not turn_lengths:
return 0.0
        # Calculate moving average of turn lengths; conversations shorter than
        # the window get a neutral score instead of a NaN variance
        window_size = 3
        if len(turn_lengths) < window_size:
            return 0.0
        moving_avgs = []
        for i in range(len(turn_lengths) - window_size + 1):
            window = turn_lengths[i:i + window_size]
            moving_avgs.append(np.mean(window))
# Calculate variance of moving averages
return 1 / (1 + np.var(moving_avgs))
def _calculate_emotion_coherence(self, emotions: List[np.ndarray]) -> float:
"""Calculate emotional coherence between turns."""
if len(emotions) < 2:
return 0.0
coherence_scores = []
for i in range(len(emotions) - 1):
similarity = np.dot(emotions[i][0], emotions[i+1][0])
coherence_scores.append(similarity)
        return float(np.mean(coherence_scores))
    def _calculate_sentiment_alignment(self, turns: List[Dict]) -> float:
        """
        Measure how closely consecutive turns track each other's sentiment.
        Referenced above but previously undefined; this simple proxy uses
        TextBlob polarity (which is what the TextBlob import is for).
        """
        if len(turns) < 2:
            return 0.0
        polarities = [TextBlob(turn['text']).sentiment.polarity for turn in turns]
        diffs = [abs(polarities[i] - polarities[i + 1]) for i in range(len(polarities) - 1)]
        # Map mean divergence to an alignment score in (0, 1]
        return 1 / (1 + np.mean(diffs))
    def _calculate_emotional_range(self, emotions: List[np.ndarray]) -> float:
        """
        Proxy for emotional range: probability mass outside the single dominant
        emotion, averaged over the conversation. Also referenced above but
        previously undefined.
        """
        if not emotions:
            return 0.0
        mean_dist = np.mean([e[0] for e in emotions], axis=0)
        return float(1.0 - np.max(mean_dist))
def _calculate_topic_coherence(self, turns: List[Dict]) -> float:
"""Measure topic development and maintenance."""
# Simplified topic coherence using lexical overlap
        words_by_turn = [set(turn['text'].lower().split()) for turn in turns]
        if len(words_by_turn) < 2:
            return 0.0
        coherence_scores = []
        for i in range(len(words_by_turn) - 1):
            # Jaccard similarity between adjacent turns
            overlap = len(words_by_turn[i] & words_by_turn[i+1])
            union = len(words_by_turn[i] | words_by_turn[i+1])
            coherence_scores.append(overlap / union if union > 0 else 0)
        return float(np.mean(coherence_scores))
def _calculate_information_density(self, sentences: List[str]) -> float:
"""Calculate information density using sentence complexity."""
        # Using sentence length and unique-word ratio as a proxy for information density
        if not sentences:
            return 0.0
        densities = []
for sent in sentences:
words = sent.split()
unique_ratio = len(set(words)) / len(words) if words else 0
densities.append(len(words) * unique_ratio)
return np.mean(densities)
def _calculate_qa_ratio(self, turns: List[Dict]) -> float:
"""Calculate ratio of questions to answers."""
questions = sum(1 for turn in turns if '?' in turn['text'])
        return questions / len(turns) if turns else 0
    def _calculate_followup_rate(self, turns: List[Dict]) -> float:
        """
        Rate at which a turn follows up on the previous one with a question.
        Referenced above but previously undefined; a simple proxy.
        """
        if len(turns) < 2:
            return 0.0
        followups = sum(1 for i in range(1, len(turns)) if '?' in turns[i]['text'])
        return followups / (len(turns) - 1)
def _detect_engagement_signals(self, turns: List[Dict]) -> float:
"""Detect explicit engagement signals in conversation."""
engagement_markers = [
r'\b(interesting|fascinating|tell me more|i see|wow|really|oh)\b',
r'[!?]{2,}',
r'\b(agree|exactly|absolutely|definitely)\b'
]
total_signals = 0
for turn in turns:
for marker in engagement_markers:
total_signals += len(re.findall(marker, turn['text'].lower()))
return total_signals / len(turns) if turns else 0
def _split_into_turns(self, transcript: str) -> List[Dict[str, str]]:
"""Split transcript into turns."""
turns = []
pattern = r'<(Person\d)>(.*?)</\1>'
matches = re.finditer(pattern, transcript, re.DOTALL)
for match in matches:
speaker, text = match.groups()
turns.append({
'speaker': speaker,
'text': text.strip()
})
return turns
# Example usage
def main():
evaluator = EngagementMetrics()
transcript = """
<Person1>What's your take on artificial intelligence?</Person1>
<Person2>AI is fascinating! It's revolutionizing how we solve complex problems.</Person2>
<Person1>That's interesting! Could you give an example?</Person1>
<Person2>Absolutely! Take medical diagnosis for instance. AI can now detect patterns in medical images with remarkable accuracy!</Person2>
"""
results = evaluator.evaluate_engagement(transcript)
print("Engagement Metrics:")
for category, scores in results.items():
print(f"\n{category.replace('_', ' ').title()}:")
for metric, value in scores.items():
print(f" {metric}: {value:.3f}")
if __name__ == "__main__":
main()
LLM-as-a-judge sounds like an interesting, flexible and clean solution for this
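A minimal sketch of what that could look like (assumes the openai Python client, v1 API; the model name, rubric, and score scale are placeholders, not a settled design):

# LLM-as-a-judge sketch. Rubric dimensions and 1-5 scale are illustrative.
from openai import OpenAI

client = OpenAI()

JUDGE_PROMPT = """You are evaluating a generated podcast transcript.
Rate each dimension from 1 (poor) to 5 (excellent) and reply as JSON:
{{"engagement": _, "accuracy": _, "config_adherence": _}}

Target configuration: {config}
Source material: {source}
Transcript: {transcript}"""

def judge_transcript(transcript: str, source: str, config: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o",  # placeholder; any capable judge model works
        messages=[{"role": "user", "content": JUDGE_PROMPT.format(
            config=config, source=source, transcript=transcript)}],
        temperature=0,  # deterministic scoring
    )
    return response.choices[0].message.content

A structured-output mode (or JSON parsing with retries) would make the scores machine-readable for aggregation across episodes.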
Also, https://github.com/HLasse/TextDescriptives for a metrics-based approach.
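For instance, something like this (assuming TextDescriptives v2's extract_metrics API; the metric group names come from its docs):

# Pull readability, descriptive, and coherence stats in one call.
import textdescriptives as td

metrics_df = td.extract_metrics(
    text="Neural networks learn through backpropagation...",
    spacy_model="en_core_web_sm",
    metrics=["descriptive_stats", "readability", "coherence"],
)
print(metrics_df.T)  # one row per text, one column per metric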
"evals are surprisingly often all you need"
But here we are evaluating a pretty novel dimension:
How can we systematically quantify whether generated text/audio is engaging and follows a target configuration, while delivering accurate and informative content?
This is perhaps the most challenging yet most pressing issue to solve.
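For the "follows a target configuration" part specifically, one hedged starting point is to diff simple transcript statistics against the requested config; the field names here (word_count, num_speakers) are hypothetical, not an existing schema:

# Hypothetical sketch: score how closely a transcript matches a target config.
import re

def config_adherence(transcript: str, target: dict) -> dict:
    # Reuse the <PersonN>...</PersonN> turn convention from the snippets above
    turns = re.findall(r'<(Person\d)>(.*?)</\1>', transcript, re.DOTALL)
    word_count = sum(len(text.split()) for _, text in turns)
    speakers = {speaker for speaker, _ in turns}
    return {
        'word_count_ratio': word_count / target['word_count'],
        'speaker_match': len(speakers) == target['num_speakers'],
        'questions_per_turn': sum('?' in t for _, t in turns) / max(len(turns), 1),
    }

print(config_adherence(
    "<Person1>Hi?</Person1><Person2>Hello there!</Person2>",
    {'word_count': 100, 'num_speakers': 2},
))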