Open vmarkovtsev opened 5 years ago
List of TODOs:
--
./lookout/style/format/analyzer.py- "\n\t".join("%-55s %.5E" % (fe.feature_names[i], importances[i])
./lookout/style/format/analyzer.py- for i in numpy.argsort(-importances)[:25] if importances[i] > 1e-5))
./lookout/style/format/analyzer.py- submit_event("%s.train.%s.rules" % (cls.name, language), len(trainable_rules.rules))
./lookout/style/format/analyzer.py: # TODO(vmarkovtsev): save the achieved precision, recall, etc. to the model
./lookout/style/format/analyzer.py- # throw away imprecise classes
./lookout/style/format/analyzer.py- if trainable_rules.rules.rules:
./lookout/style/format/analyzer.py- model[language] = trainable_rules.rules
--
./lookout/style/format/classes.py-"""Predicted class definitions."""
./lookout/style/format/classes.py:# TODO(zurk): refactor CLS related code into one class. Related issue:
./lookout/style/format/classes.py-# https://github.com/src-d/style-analyzer/issues/286
./lookout/style/format/classes.py-
./lookout/style/format/classes.py-CLS_SPACE = "<space>"
--
./lookout/style/format/benchmarks/evaluate_smoke.py- else:
./lookout/style/format/benchmarks/evaluate_smoke.py- detected_wrong_fix += 1
./lookout/style/format/benchmarks/evaluate_smoke.py-
./lookout/style/format/benchmarks/evaluate_smoke.py: # TODO (zurk): Add proper class for benchmark metrics
./lookout/style/format/benchmarks/evaluate_smoke.py- # https://github.com/src-d/style-analyzer/issues/333
./lookout/style/format/benchmarks/evaluate_smoke.py- return misdetection, undetected, detected_wrong_fix, detected_correct_fix
./lookout/style/format/benchmarks/evaluate_smoke.py-
--
./lookout/style/format/benchmarks/evaluate_smoke.py- :return: A dictionary with losses and predicted code for "global" and "local" indentation \
./lookout/style/format/benchmarks/evaluate_smoke.py- strategies.
./lookout/style/format/benchmarks/evaluate_smoke.py- """
./lookout/style/format/benchmarks/evaluate_smoke.py: # TODO (zurk): make optional arguments non-optional.
./lookout/style/format/benchmarks/evaluate_smoke.py- if (vnodes is None) ^ (fe is None):
./lookout/style/format/benchmarks/evaluate_smoke.py- raise ValueError("vnodes and fe should be both None or not None.")
./lookout/style/format/benchmarks/evaluate_smoke.py- losses = {}
--
./lookout/style/format/benchmarks/generate_smoke.py- writer = csv.DictWriter(index_file, fieldnames=["repo", "style", "from", "to"])
./lookout/style/format/benchmarks/generate_smoke.py- writer.writeheader()
./lookout/style/format/benchmarks/generate_smoke.py- for style_name, (pattern, repl) in js_format_rules.items():
./lookout/style/format/benchmarks/generate_smoke.py: # TODO(zurk): handle custom modification functions as well as regexp.
./lookout/style/format/benchmarks/generate_smoke.py- pattern = re.compile(pattern)
./lookout/style/format/benchmarks/generate_smoke.py- for repopath in repopaths:
./lookout/style/format/benchmarks/generate_smoke.py- repo = Repo(str(repopath))
--
./lookout/style/format/benchmarks/general_report.py- to_show = to_show[:max_files]
./lookout/style/format/benchmarks/general_report.py-
./lookout/style/format/benchmarks/general_report.py- template = _load_jinja2_template("quality_report.md.jinja2")
./lookout/style/format/benchmarks/general_report.py: # TODO(vmarkovtsev): move all the logic inside the template
./lookout/style/format/benchmarks/general_report.py- res = template.render(language=language, ptr=ptr, conf_mat=mat, target_names=target_names,
./lookout/style/format/benchmarks/general_report.py- files=to_show, cl_report=c_sorted, cl_report_full=c_report_full,
./lookout/style/format/benchmarks/general_report.py- ppcr=ppcr)
--
./lookout/style/format/benchmarks/top_repos_quality.py-
./lookout/style/format/benchmarks/top_repos_quality.py-from lookout.style.format.benchmarks.general_report import QualityReportAnalyzer
./lookout/style/format/benchmarks/top_repos_quality.py-
./lookout/style/format/benchmarks/top_repos_quality.py:# TODO(zurk): Move REPOSITORIES to ./benchmarks/data/default_repository_list.csv
./lookout/style/format/benchmarks/top_repos_quality.py-# format: "repository to-commit from-commit"
./lookout/style/format/benchmarks/top_repos_quality.py-REPOSITORIES = [
./lookout/style/format/benchmarks/top_repos_quality.py- "https://github.com/30-seconds/30-seconds-of-code 3a122c9cfcbdc091227879a06a32bc67ccd0d35d c8c60895e80b8bc90583502accdaa339b794609c", # noqa: E501
--
./lookout/style/format/benchmarks/top_repos_quality.py- "https://github.com/vuejs/vue-cli 2024f4e52097bed96b527d2e519dccba334f434b 85e4f9839ba88d1283b10bb3112582792b8dac6e", # noqa: E501
./lookout/style/format/benchmarks/top_repos_quality.py- "https://github.com/meteor/meteor c3309b123a7220ac24cbe73661184ee946bca01f 62fa9927ce34cff064cc3991439553e7c52b5258", # noqa: E501
./lookout/style/format/benchmarks/top_repos_quality.py- "https://github.com/webpack/webpack babe736cfa1ef7e8014ed32ba4a4ec38049dce14 3e74cb428af04eedac60ae13d2420d2b5bd3bde1", # noqa: E501
./lookout/style/format/benchmarks/top_repos_quality.py: # TODO: add after bblfsh python client v3 is released and we use it
./lookout/style/format/benchmarks/top_repos_quality.py- # "https://github.com/vuejs/vue b7105ae8c9093e36ec89a470caa3b78bda3ef467 db1d0474997e9e60b8b0d39a3b7c2af55cfd0d4a", # noqa: E501
./lookout/style/format/benchmarks/top_repos_quality.py- # "https://github.com/vuejs/vuex 2e62705d4bce4ebcb8eca23df8c7b849125fc565 1ac16a95c574f6b1386016fb6d4f00cfd2ee1d60", # noqa: E501
./lookout/style/format/benchmarks/top_repos_quality.py- "https://github.com/segmentio/evergreen ba22d511dad83c072842e47801ef42697d142f7c 1030eca5da38dce4e5047c23a3ea7fc0c246b8ce", # noqa: E501
--
./lookout/style/format/benchmarks/top_repos_quality.py- if os.path.exists(repository):
./lookout/style/format/benchmarks/top_repos_quality.py- return repository
./lookout/style/format/benchmarks/top_repos_quality.py- # clone repository
./lookout/style/format/benchmarks/top_repos_quality.py: # TODO: use dulwich in the future
./lookout/style/format/benchmarks/top_repos_quality.py- git_dir = os.path.join(storage_dir, get_repo_name(repository)) # location for code
./lookout/style/format/benchmarks/top_repos_quality.py- cmd = "git clone --single-branch --branch master %s %s" % (repository, git_dir)
./lookout/style/format/benchmarks/top_repos_quality.py- subprocess.check_call(cmd.split())
--
./lookout/style/format/benchmarks/top_repos_quality.py-def calc_weighted_avg(arr: Sequence[Sequence], col: int, weight_col: int = 5) -> float:
./lookout/style/format/benchmarks/top_repos_quality.py- """Calculate average value in `col` weighted by column `weight_col`."""
./lookout/style/format/benchmarks/top_repos_quality.py- numerator, denominator = 0, 0
./lookout/style/format/benchmarks/top_repos_quality.py: # TODO: switch to numpy arrays
./lookout/style/format/benchmarks/top_repos_quality.py- for row in arr:
./lookout/style/format/benchmarks/top_repos_quality.py- numerator += row[col] * row[weight_col]
./lookout/style/format/benchmarks/top_repos_quality.py- denominator += row[weight_col]
--
./lookout/style/format/benchmarks/top_repos_quality.py-def calc_avg(arr: Sequence[Sequence], col: int) -> float:
./lookout/style/format/benchmarks/top_repos_quality.py- """Calculate average value in `col`."""
./lookout/style/format/benchmarks/top_repos_quality.py- numerator, denominator = 0, 0
./lookout/style/format/benchmarks/top_repos_quality.py: # TODO: switch to numpy arrays
./lookout/style/format/benchmarks/top_repos_quality.py- for row in arr:
./lookout/style/format/benchmarks/top_repos_quality.py- numerator += row[col]
./lookout/style/format/benchmarks/top_repos_quality.py- denominator += 1
--
./lookout/style/format/benchmarks/top_repos_quality.py-def _get_model_summary(report: str) -> (int, float):
./lookout/style/format/benchmarks/top_repos_quality.py- """Extract model summary - number of rules and avg. len."""
./lookout/style/format/benchmarks/top_repos_quality.py- data = _get_json_data(report)
./lookout/style/format/benchmarks/top_repos_quality.py: # TODO(vmarkovtsev): address this embarrasing hardcode
./lookout/style/format/benchmarks/top_repos_quality.py- return data["javascript"]["num_rules"], data["javascript"]["avg_rule_len"]
./lookout/style/format/benchmarks/top_repos_quality.py-
./lookout/style/format/benchmarks/top_repos_quality.py-
and FIXME
./lookout/style/format/benchmarks/general_report.py- vnodes = chain.from_iterable(fix.file_vnodes for fix in fixes)
./lookout/style/format/benchmarks/general_report.py- ys = numpy.hstack(fix.y for fix in fixes)
./lookout/style/format/benchmarks/general_report.py- y_pred_pure = numpy.hstack(fix.y_pred_pure for fix in fixes)
./lookout/style/format/benchmarks/general_report.py: # FIXME(vmarkovtsev): we are taking the first fix here which does not work for >1 language
./lookout/style/format/benchmarks/general_report.py- return generate_quality_report(
./lookout/style/format/benchmarks/general_report.py- fixes[0].language, self.model.ptr, vnodes, ys, y_pred_pure, self.config["max_files"],
./lookout/style/format/benchmarks/general_report.py- fixes[0].feature_extractor.composite_class_representations)
Command used to collect this list:
grep -r --exclude=\*.{xml,js,coffee,md,map,ts,in,html,yml,npmignore,ipynb,go,proto} "FIXME" -A3 -B3 .
grep -r --exclude=\*.{xml,js,coffee,md,map,ts,in,html,yml,npmignore,ipynb,go,proto} "TODO" -A3 -B3 .
# ./lookout/style/format/analyzer.py:
"\n\t".join("%-55s %.5E" % (fe.feature_names[i], importances[i])
for i in numpy.argsort(-importances)[:25] if importances[i] > 1e-5))
submit_event("%s.train.%s.rules" % (cls.name, language), len(trainable_rules.rules))
# TODO(vmarkovtsev): save the achieved precision, recall, etc. to the model
# throw away imprecise classes
if trainable_rules.rules.rules:
model[language] = trainable_rules.rules
should be covered by https://github.com/src-d/style-analyzer/pull/534
# ./lookout/style/format/classes.py-
"""Predicted class definitions."""
# TODO(zurk): refactor CLS related code into one class. Related issue:
# https://github.com/src-d/style-analyzer/issues/286
CLS_SPACE = "<space>"
related issue: https://github.com/src-d/style-analyzer/issues/286
# ./lookout/style/format/benchmarks/evaluate_smoke.py-
else:
detected_wrong_fix += 1
# TODO (zurk): Add proper class for benchmark metrics
# https://github.com/src-d/style-analyzer/issues/333
return misdetection, undetected, detected_wrong_fix, detected_correct_fix
related issue: Rename benchmark metric names
#./lookout/style/format/benchmarks/evaluate_smoke.py-
:return: A dictionary with losses and predicted code for "global" and "local" indentation \
strategies.
"""
# TODO (zurk): make optional arguments non-optional.
if (vnodes is None) ^ (fe is None):
raise ValueError("vnodes and fe should be both None or not None.")
losses = {}
@zurk, could you explain what should be done here?
# ./lookout/style/format/benchmarks/generate_smoke.py-
writer = csv.DictWriter(index_file, fieldnames=["repo", "style", "from", "to"])
writer.writeheader()
for style_name, (pattern, repl) in js_format_rules.items():
# TODO(zurk): handle custom modification functions as well as regexp.
pattern = re.compile(pattern)
for repopath in repopaths:
repo = Repo(str(repopath))
It could be done like this:
if isinstance(pattern, str):
pattern = re.compile(pattern)
pattern = lambda repl, code: re.sub(pattern, repl, code)
....
new_code = pattern(repl, code)
filepath.write_text(new_code)
#./lookout/style/format/benchmarks/general_report.py-
to_show = to_show[:max_files]
template = _load_jinja2_template("quality_report.md.jinja2")
# TODO(vmarkovtsev): move all the logic inside the template
res = template.render(language=language, ptr=ptr, conf_mat=mat, target_names=target_names,
files=to_show, cl_report=c_sorted, cl_report_full=c_report_full,
ppcr=ppcr)
It's possible to create a dictionary and pass it to render() — is that your intention, @vmarkovtsev?
#./lookout/style/format/benchmarks/top_repos_quality.py-
from lookout.style.format.benchmarks.general_report import QualityReportAnalyzer
# TODO(zurk): Move REPOSITORIES to ./benchmarks/data/default_repository_list.csv
# format: "repository to-commit from-commit"
REPOSITORIES = [
"https://github.com/30-seconds/30-seconds-of-code 3a122c9cfcbdc091227879a06a32bc67ccd0d35d c8c60895e80b8bc90583502accdaa339b794609c", # noqa: E501
related PR: Add input file for quality report
#./lookout/style/format/benchmarks/top_repos_quality.py-
if os.path.exists(repository):
return repository
# clone repository
# TODO: use dulwich in the future
git_dir = os.path.join(storage_dir, get_repo_name(repository)) # location for code
cmd = "git clone --single-branch --branch master %s %s" % (repository, git_dir)
subprocess.check_call(cmd.split())
Example of how to do it and how it should look in the end:
from dulwich import porcelain
...
porcelain.clone(repository, git_dir)
Related PR: https://github.com/src-d/style-analyzer/pull/535
#./lookout/style/format/benchmarks/top_repos_quality.py-
def calc_weighted_avg(arr: Sequence[Sequence], col: int, weight_col: int = 5) -> float:
"""Calculate average value in `col` weighted by column `weight_col`."""
numerator, denominator = 0, 0
# TODO: switch to numpy arrays
for row in arr:
numerator += row[col] * row[weight_col]
denominator += row[weight_col]
solution
arr = numpy.array(arr)
return (arr[:, col] * arr[:, weight_col]).sum() / arr[:, weight_col].sum()
Related PR: https://github.com/src-d/style-analyzer/pull/535
#./lookout/style/format/benchmarks/top_repos_quality.py-
def calc_avg(arr: Sequence[Sequence], col: int) -> float:
"""Calculate average value in `col`."""
numerator, denominator = 0, 0
# TODO: switch to numpy arrays
for row in arr:
numerator += row[col]
denominator += 1
solution
arr = numpy.array(arr)
return arr[:, col].sum() / arr.shape[0]
Related PR: https://github.com/src-d/style-analyzer/pull/535
#./lookout/style/format/benchmarks/top_repos_quality.py-
def _get_model_summary(report: str) -> (int, float):
"""Extract model summary - number of rules and avg. len."""
data = _get_json_data(report)
# TODO(vmarkovtsev): address this embarrasing hardcode
return data["javascript"]["num_rules"], data["javascript"]["avg_rule_len"]
Solution: iterate through available languages and return these parameters for each language
#./lookout/style/format/benchmarks/general_report.py-
vnodes = chain.from_iterable(fix.file_vnodes for fix in fixes)
ys = numpy.hstack(fix.y for fix in fixes)
y_pred_pure = numpy.hstack(fix.y_pred_pure for fix in fixes)
# FIXME(vmarkovtsev): we are taking the first fix here which does not work for >1 language
return generate_quality_report(
fixes[0].language, self.model.ptr, vnodes, ys, y_pred_pure, self.config["max_files"],
fixes[0].feature_extractor.composite_class_representations)
Solution: generate quality report per language - requires refactoring of reports
@vmarkovtsev let's close this one. This issue is related to an old refactoring phase. Some of the TODOs were solved, and we already have a bunch of new TODOs — but that is another story.
Convert to issues as needed.