Currently, the error-message formatter truncates messages in a generic way that does not produce a clean, user-friendly result.
Quick reproducer (though I don't see truncation here myself):
throw new RuntimeException()
In particular, a rather long code example (below) produced the result in the following screenshot: both the first word, DEFAULT, and the (shortened) package name lost their first character. This looks sloppy, even though it is technically correct given how we truncate messages.
Install the following packages via pip, following the instructions here: https://github.com/deephaven/deephaven.io/pull/541/files
pip install tensorflow
pip install tensorflow_hub
pip install sklearn
pip install spacy
pip install bs4
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
import string
import re
import spacy
import deephaven.npy as inp
from deephaven.java_to_python import columnToNumpyArray
from deephaven import DynamicTableWriter, Types as dht
from deephaven.TableTools import timeTable
#spaCy pipeline shared by every cleanText call; loaded lazily on first use
_spacyModel = None

def _getSpacyModel():
    """Return the 'en_core_web_sm' spaCy pipeline, loading it only once."""
    global _spacyModel
    if _spacyModel is None:
        _spacyModel = spacy.load('en_core_web_sm')
    return _spacyModel

def cleanText(text):
    """Normalize a piece of text before it is vectorized or classified.

    The text is lower-cased, a missing space after sentence punctuation is
    restored, quarter and year tokens are collapsed to generic values, and the
    result is lemmatized with stop words and punctuation dropped.

    Args:
        text: the raw text (a str).

    Returns:
        The kept lemmas, each followed by a single space (so a non-empty
        result ends with a trailing space).
    """
    #to lowercase
    text = text.lower()
    #correct spaces (e.g. "End sentence.Begin another" becomes "End sentence. Begin another")
    #'?' and '!' directly followed by a letter are rewritten to '. ' as well
    text = re.sub(r'[.?!]([a-zA-Z])', r'. \1', text)
    #replace q1,2,3,4 with q
    text = re.sub("q[1-4]", "q", text)
    #replace 20xx with 2000
    text = re.sub("20[0-2][0-9]", "2000", text)
    #lemmatize and remove stop words and punctuation
    #the pipeline is cached because loading it on every call is very slow
    doc = _getSpacyModel()(text)
    lemmas = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        #"-PRON-" is a placeholder lemma (emitted by spaCy 2.x for pronouns)
        lemmas.append("it" if token.lemma_ == "-PRON-" else token.lemma_)
    return "".join(lemma + " " for lemma in lemmas)
def shuffleTable(unshuffledTable):
    """Return unshuffledTable with its rows sorted on a random key.

    A temporary column "__r" is filled from Math.random(), the table is
    sorted on it, and the column is dropped again.
    """
    #tag every row with a random sort key
    keyed = unshuffledTable.update("__r=Math.random()")
    #order by the key, then discard it so the columns match the input
    ordered = keyed.sort("__r")
    return ordered.dropColumns("__r")
def centroid(trainTextVectorized, trainLabels, testTextVectorized):
    """Classify testTextVectorized with a nearest-centroid model.

    A NearestCentroid classifier using the manhattan metric is fitted on
    (trainTextVectorized, trainLabels) and its predictions for
    testTextVectorized are returned.
    """
    #fit() returns the fitted estimator, so the calls can be chained
    return (
        NearestCentroid(metric='manhattan')
        .fit(trainTextVectorized, trainLabels)
        .predict(testTextVectorized)
    )
def naiveBayes(trainTextVectorized, trainLabels, testTextVectorized):
    """Classify testTextVectorized with a Bernoulli naive Bayes model.

    A BernoulliNB classifier with alpha=1 is fitted on
    (trainTextVectorized, trainLabels) and its predictions for
    testTextVectorized are returned.
    """
    #fit() returns the fitted estimator, so the calls can be chained
    return (
        BernoulliNB(alpha=1)
        .fit(trainTextVectorized, trainLabels)
        .predict(testTextVectorized)
    )
def preTrainedEmbedding(trainText, trainLabels, evalText, valSize, trainEmb=True):
    """Train a small classifier on a pre-trained text embedding and score evalText.

    Args:
        trainText: training texts; the last valSize entries are held out
            for validation.
        trainLabels: labels aligned with trainText.
        evalText: input handed to model.predict after training.
        valSize: number of trailing examples used as validation data.
            NOTE: expected to be positive; with 0 the slices below would put
            every example in the validation set and none in the training set.
        trainEmb: whether the embedding layer is trainable.

    Returns:
        Whatever model.predict(evalText) returns for the trained model.
    """
    #initialize training and validation data
    valText = trainText[-valSize:]
    valLabels = trainLabels[-valSize:]
    trainText = trainText[:-valSize]
    trainLabels = trainLabels[:-valSize]
    #create the model: embedding -> 16-unit relu layer -> single sigmoid output
    hubLayer = hub.KerasLayer("https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim-with-oov/1", output_shape=[20],
                              input_shape=[], dtype=tf.string, trainable=trainEmb)
    model = keras.Sequential(name="mymodel")
    model.add(hubLayer)
    model.add(keras.layers.Dense(16, activation='relu'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    #the last layer already applies a sigmoid, so the loss is given
    #probabilities rather than logits
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                  metrics=['accuracy'])
    model.fit(trainText,
              trainLabels,
              epochs=40,
              batch_size=4,
              validation_data=(valText, valLabels),
              verbose=0)
    #hand the predictions back so that callers receive a value instead of None
    return model.predict(evalText, verbose=1)
def predict(text, model):
    """Predict an integer label for text with the selected model.

    Args:
        text: the text to classify.
        model: 'c' for the nearest-centroid classifier, 'nb' for naive Bayes,
            'e' for the pre-trained embedding network.

    Returns:
        The prediction converted with int(), or None for any other model code.

    Relies on the module-level trainText, trainLabels, vectorizer and
    trainTextVectorized.
    """
    if model == 'e':
        return int(preTrainedEmbedding(trainText, trainLabels, text, 10))
    if model != 'c' and model != 'nb':
        #unrecognized model codes produce no prediction
        return None
    #both remaining models work on the vectorized form of the text
    textVectorized = vectorizer.transform([text])
    classify = centroid if model == 'c' else naiveBayes
    return int(classify(trainTextVectorized, trainLabels, textVectorized)[0])
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import string
from deephaven.TableTools import emptyTable
from deephaven.conversion_utils import convertToJavaArray
#keeps only the xml items whose title ends with "Earnings Call Transcript"
def getEarningsCalls(items):
    """Return the items whose title's last three words are "Earnings Call Transcript".

    Each item must expose its title as item.title.text; the input order is kept.
    """
    wantedEnding = ["Earnings", "Call", "Transcript"]
    earningsCalls = []
    for item in items:
        titleWords = item.title.text.split()
        if titleWords[-3:] == wantedEnding:
            earningsCalls.append(item)
    return earningsCalls
#extracts the symbol and the quarter/year from an article's header
def parseHeader(header):
    """Return (symbol, quarter) parsed from an article header.

    The symbol is the text between the last "(" and the last ")" of the
    header; the quarter is the first substring shaped like "Q3 2020".

    Raises:
        ValueError: if the header has no "(" or no ")".
        AttributeError: if the header contains no quarter/year substring.
    """
    openIdx = header.rindex("(")
    closeIdx = header.rindex(")")
    #the pattern always matches exactly seven characters, e.g. "Q3 2020"
    quarterMatch = re.search("Q[1-4] 20[0-2][0-9]", header)
    return (header[openIdx+1:closeIdx], quarterMatch.group(0))
#locates the first paragraph whose text starts with one of the search strings
#several of the functions below depend on it
def findIdx(paragraphs, searchList):
    """Return the index of the first paragraph starting with an entry of searchList.

    The comparison is made against the lower-cased paragraph text, so the
    entries of searchList are expected to be lower case. When no paragraph
    matches, the number of paragraphs is returned.
    """
    prefixes = tuple(searchList)
    seen = 0
    for seen, paragraph in enumerate(paragraphs, start=1):
        if paragraph.text.lower().startswith(prefixes):
            #seen counts from 1, the index counts from 0
            return seen - 1
    return seen
#collects the names of the company participants on the call
#the functions below use these names to find speaker sections
def getNames(paragraphs):
    """Return the first two words of each company-participant roll-call line.

    The roll-call is taken to be the paragraphs strictly between the company
    participants heading and the conference call participants heading.
    Collection stops early at the first paragraph with fewer than two words.
    """
    companyHeadings = ["company participants", "corporate participants", "executives", "company representatives"]
    conferenceHeadings = ["conference call participants", "analysts"]
    rollCallStart = findIdx(paragraphs, companyHeadings)
    rollCallEnd = findIdx(paragraphs, conferenceHeadings)
    names = []
    #skip the heading itself and stop before the next heading
    for paragraph in paragraphs[rollCallStart + 1:rollCallEnd]:
        words = paragraph.text.split()
        if len(words) < 2:
            break
        names.append(" ".join(words[:2]))
    return names
#strips the roll-call, the operator announcement, and the q&a from the call
def truncate(paragraphs, names):
    """Return the paragraphs from the first company speaker up to the q&a.

    Args:
        paragraphs: the article's paragraphs; each needs .text and .attrs and
            must support paragraph["id"] when "id" is among its attributes.
        names: the company participant names ("First Last").

    Returns:
        The selected section, starting at the first paragraph whose first two
        words form one of names (or at the first paragraph with fewer than
        two words, whichever comes first).
    """
    total = len(paragraphs)
    #position of the operator paragraph, or the paragraph count if there is none
    operatorIdx = next(
        (i for i, p in enumerate(paragraphs) if p.text.lower().startswith("operator")),
        total)
    #position of the q&a marker paragraph, or the paragraph count if there is none
    qaIdx = next(
        (i for i, p in enumerate(paragraphs)
         if "id" in p.attrs and p["id"].lower().startswith("question-answer-session")),
        total)
    if operatorIdx < qaIdx:
        #there is an operator section: keep what lies between it and the q&a
        section = paragraphs[operatorIdx+1:qaIdx-1]
    else:
        #no operator section: drop the company participant roll-call instead,
        #which the search for the first speaker below depends on
        confIdx = findIdx(paragraphs, ["conference call participants", "analysts"])
        section = paragraphs[confIdx:qaIdx-1]
    #the first company participant's speaking section starts either the
    #safe-harbor statement or the CEO presentation
    nameIdx = 0
    for paragraph in section:
        words = paragraph.text.split()
        if len(words) < 2 or " ".join(words[:2]) in names:
            break
        nameIdx += 1
    #everything before that first speaking section is discarded
    return section[nameIdx:]
#tells whether the call appears to contain a safe-harbor section
def hasSafeHarborStatement(paragraphs):
    """Return True if any paragraph text contains a safe-harbor phrase.

    The phrases are matched case-sensitively as plain substrings.
    """
    markers = ("10-K", "forward-looking statements", "forward-looking information", "non-GAAP")
    return any(
        marker in paragraph.text
        for paragraph in paragraphs
        for marker in markers
    )
#remove the call's safe-harbor section with the assumption that it exists
def removeSafeHarborStatement(paragraphs, names):
    """Drop the first company speaker's section from paragraphs.

    The first company speaker is assumed to deliver the safe-harbor
    statement, so everything from that speaker's heading up to and including
    the next speaker heading is removed.

    Args:
        paragraphs: list of paragraphs exposing .text.
        names: the company participant names ("First Last").

    Returns:
        A new list without the first speaker's section. When fewer than two
        speaker headings are found the section cannot be delimited, so an
        unchanged copy of paragraphs is returned.
    """
    #indices of the first two company participant speaker headings
    speakerIdxs = []
    for i, paragraph in enumerate(paragraphs):
        words = paragraph.text.split()
        #empty or one-word paragraphs cannot be a "First Last" heading, and
        #a paragraph starting with "presentation" is never treated as one
        if len(words) < 2 or words[0].lower() == "presentation":
            continue
        if words[0] + " " + words[1] in names:
            speakerIdxs.append(i)
            if len(speakerIdxs) == 2:
                break
    if len(speakerIdxs) < 2:
        #without a second heading, slicing would duplicate paragraphs
        return list(paragraphs)
    startIdx, endIdx = speakerIdxs
    #remove the section between the two indices, i.e. the first speaker's section
    return paragraphs[:startIdx] + paragraphs[endIdx+1:]
#drops the paragraphs that consist solely of a company participant's name
#(each speaker name sits in a paragraph of its own in the call)
def removeNames(paragraphs, names):
    """Return the paragraphs whose full text is not one of names."""
    kept = []
    for paragraph in paragraphs:
        if paragraph.text in names:
            continue
        kept.append(paragraph)
    return kept
#joins the text of all paragraphs into one string
def collate(paragraphs):
    """Return the paragraph texts concatenated in order, with no separator."""
    return "".join(paragraph.text for paragraph in paragraphs)
#seconds to wait on each HTTP request before giving up
_RSS_TIMEOUT = 30

#downloads one transcript article and reduces it to the data stored per call
def _scrapeEarningsCall(link):
    """Return (text, timestamp, symbol, quarter) for the article at link.

    Raises whatever the download or the parsing raises when the article is
    missing, malformed, or behind an access wall.
    """
    #get the transcript article
    source = requests.get(link, timeout=_RSS_TIMEOUT).text
    soup = BeautifulSoup(source, "lxml")
    #find the header, timestamp, and paragraphs of the article
    article = soup.find("article")
    headerDiv = article.header.find("div", id="a-hd")
    header = headerDiv.h1.text
    timestamp = headerDiv.find("div", class_="a-info clearfix").time["content"]
    paragraphs = article.find("div", id="a-cont").find("div", id="a-body").find_all("p")
    #get symbol and quarter from the header
    symbol, quarter = parseHeader(header)
    #clean and collate the paragraphs
    names = getNames(paragraphs)
    paragraphs = truncate(paragraphs, names)
    if hasSafeHarborStatement(paragraphs):
        paragraphs = removeSafeHarborStatement(paragraphs, names)
    paragraphs = removeNames(paragraphs, names)
    return (collate(paragraphs), timestamp, symbol, quarter)

def runRSS():
    """Poll the transcript RSS feed and log every new earnings call to tw.

    Each earnings-call article in the feed is downloaded and cleaned;
    articles that cannot be downloaded or parsed are skipped. A call is new
    when its (Sym, Quarter) pair is neither in the calls table nor already
    logged during this run.

    Returns:
        True if at least one new call was logged, otherwise False.

    Relies on the module-level calls table and tw table writer.
    """
    #get the rss feed
    feed = requests.get("https://seekingalpha.com/sector/transcripts.xml", timeout=_RSS_TIMEOUT).text
    soup = BeautifulSoup(feed, "xml")
    items = getEarningsCalls(soup.find_all("item"))
    links = [item.link.text for item in items]
    #data for the articles where access is granted
    scraped = []
    for link in links:
        try:
            scraped.append(_scrapeEarningsCall(link))
        except Exception:
            #either bad article or access denied: skip it, keep the others
            continue
    if not scraped:
        return False
    #a call is identified by its symbol AND quarter together, so compare pairs
    #rather than testing the two columns independently
    knownCalls = set(zip(columnToNumpyArray(calls, "Sym"), columnToNumpyArray(calls, "Quarter")))
    values = jpy.array("java.lang.String", 4)
    containsNewCall = False
    for text, timestamp, symbol, quarter in scraped:
        if (symbol, quarter) in knownCalls:
            continue
        #remember the pair so a duplicate feed entry is not logged twice
        knownCalls.add((symbol, quarter))
        containsNewCall = True
        values[0] = text
        values[1] = timestamp
        values[2] = symbol
        values[3] = quarter
        tw.logRow(values)
    return containsNewCall
#NOTE(review): this rebinds DynamicTableWriter, replacing the name imported
#from `deephaven` near the top of the script, with the Java class obtained
#through jpy. `jpy` is not imported in the visible code - presumably the
#console provides it; confirm which DynamicTableWriter is intended.
DynamicTableWriter = jpy.get_type("io.deephaven.db.v2.utils.DynamicTableWriter")
#NOTE(review): trainData is never assigned in the visible code; the
#commented-out lines below look like the original way of loading it, so it
#must be defined some other way before this point.
#trainData = db.t("Noor", "trainingData")\
# .moveUpColumns("Sym", "Quarter", "Label")
#randomize the row order of the training data
trainData = shuffleTable(trainData)
#pull the Text column and the Label column (as a flat int32 array) into numpy
trainText = columnToNumpyArray(trainData, "Text")
trainLabels = inp.numpy_slice(trainData.view("Label"), 0, trainData.size(), dtype=np.int32)
trainLabels = np.reshape(trainLabels, -1)
#tf-idf over single words, capped at 1000 features, l1-normalized;
#predict() reuses both the fitted vectorizer and the vectorized training text
vectorizer = TfidfVectorizer(max_features=1000, ngram_range = (1,1), norm='l1')
trainTextVectorized = vectorizer.fit_transform(trainText)
#the writer that runRSS() logs rows to: four string columns
cols = ["Text", "RSSTimestamp", "Sym", "Quarter"]
types = [dht.string, dht.string, dht.string, dht.string,]
tw = DynamicTableWriter(cols, types)
#one row per (Sym, Quarter); the text is cleaned, labelled with the
#nearest-centroid model (`c`), and a predicted label of 0 is remapped to -1
calls = tw.getTable()\
.firstBy("Sym", "Quarter")\
.update("Text = (String)cleanText.call(Text)", "PredictedLabel = (int)predict.call(Text, `c`)", "PredictedLabel = PredictedLabel==0 ? -1 : PredictedLabel")\
.moveUpColumns("Sym", "Quarter", "RSSTimestamp", "PredictedLabel")
#ticking table whose update calls runRSS();
#presumably '00:01:00' means one row per minute - TODO confirm
tt = timeTable("'00:01:00'")\
.sortDescending("Timestamp")\
.update("ContainedNewCalls=(boolean)runRSS.call()")
#NOTE(review): convertDate is applied to the first 10 characters of
#RSSTimestamp; 2*60*1000 is presumably an interval in milliseconds (two
#minutes) - confirm against the preemptiveUpdatesTable documentation
callsPre = calls.view("Sym", "Date=convertDate(RSSTimestamp.substring(0,10))", "PredictedLabel")\
.preemptiveUpdatesTable(2*60*1000)
Currently, the error-message formatter truncates messages in a generic way that does not produce a clean, user-friendly result.
Quick reproducer (though I don't see truncation here myself):
throw new RuntimeException()
In particular, with a rather long code example (below), I got the following screenshot: both the first word
DEFAULT
and the (shortened) package name dropped their first character, which looks sloppy (even though it is technically correct given how we truncate messages). The longer code example is included for reference. Note that the following extra setup is needed to run it: put trainData.csv into /data:
https://drive.google.com/file/d/1ups4QERtunN1iJUjwaE0T_DqzONInFfO/view?usp=sharing
Install the following packages via pip, following the instructions here: https://github.com/deephaven/deephaven.io/pull/541/files
pip install tensorflow
pip install tensorflow_hub
pip install sklearn
pip install spacy
pip install bs4