Review all TODOs and FIXMEs

vmarkovtsev commented 5 years ago

Convert to issues as needed.

EgorBu commented 5 years ago

List of TODOs:

--
./lookout/style/format/analyzer.py-                "\n\t".join("%-55s %.5E" % (fe.feature_names[i], importances[i])
./lookout/style/format/analyzer.py-                            for i in numpy.argsort(-importances)[:25] if importances[i] > 1e-5))
./lookout/style/format/analyzer.py-            submit_event("%s.train.%s.rules" % (cls.name, language), len(trainable_rules.rules))
./lookout/style/format/analyzer.py:            # TODO(vmarkovtsev): save the achieved precision, recall, etc. to the model
./lookout/style/format/analyzer.py-            # throw away imprecise classes
./lookout/style/format/analyzer.py-            if trainable_rules.rules.rules:
./lookout/style/format/analyzer.py-                model[language] = trainable_rules.rules
--
./lookout/style/format/classes.py-"""Predicted class definitions."""
./lookout/style/format/classes.py:# TODO(zurk): refactor CLS related code into one class. Related issue:
./lookout/style/format/classes.py-# https://github.com/src-d/style-analyzer/issues/286
./lookout/style/format/classes.py-
./lookout/style/format/classes.py-CLS_SPACE = "<space>"
--
./lookout/style/format/benchmarks/evaluate_smoke.py-        else:
./lookout/style/format/benchmarks/evaluate_smoke.py-            detected_wrong_fix += 1
./lookout/style/format/benchmarks/evaluate_smoke.py-
./lookout/style/format/benchmarks/evaluate_smoke.py:    # TODO (zurk): Add proper class for benchmark metrics
./lookout/style/format/benchmarks/evaluate_smoke.py-    # https://github.com/src-d/style-analyzer/issues/333
./lookout/style/format/benchmarks/evaluate_smoke.py-    return misdetection, undetected, detected_wrong_fix, detected_correct_fix
./lookout/style/format/benchmarks/evaluate_smoke.py-
--
./lookout/style/format/benchmarks/evaluate_smoke.py-    :return: A dictionary with losses and predicted code for "global" and "local" indentation \
./lookout/style/format/benchmarks/evaluate_smoke.py-             strategies.
./lookout/style/format/benchmarks/evaluate_smoke.py-    """
./lookout/style/format/benchmarks/evaluate_smoke.py:    # TODO (zurk): make optional arguments non-optional.
./lookout/style/format/benchmarks/evaluate_smoke.py-    if (vnodes is None) ^ (fe is None):
./lookout/style/format/benchmarks/evaluate_smoke.py-        raise ValueError("vnodes and fe should be both None or not None.")
./lookout/style/format/benchmarks/evaluate_smoke.py-    losses = {}
--
./lookout/style/format/benchmarks/generate_smoke.py-        writer = csv.DictWriter(index_file, fieldnames=["repo", "style", "from", "to"])
./lookout/style/format/benchmarks/generate_smoke.py-        writer.writeheader()
./lookout/style/format/benchmarks/generate_smoke.py-        for style_name, (pattern, repl) in js_format_rules.items():
./lookout/style/format/benchmarks/generate_smoke.py:            # TODO(zurk): handle custom modification functions as well as regexp.
./lookout/style/format/benchmarks/generate_smoke.py-            pattern = re.compile(pattern)
./lookout/style/format/benchmarks/generate_smoke.py-            for repopath in repopaths:
./lookout/style/format/benchmarks/generate_smoke.py-                repo = Repo(str(repopath))
--
./lookout/style/format/benchmarks/general_report.py-        to_show = to_show[:max_files]
./lookout/style/format/benchmarks/general_report.py-
./lookout/style/format/benchmarks/general_report.py-    template = _load_jinja2_template("quality_report.md.jinja2")
./lookout/style/format/benchmarks/general_report.py:    # TODO(vmarkovtsev): move all the logic inside the template
./lookout/style/format/benchmarks/general_report.py-    res = template.render(language=language, ptr=ptr, conf_mat=mat, target_names=target_names,
./lookout/style/format/benchmarks/general_report.py-                          files=to_show, cl_report=c_sorted, cl_report_full=c_report_full,
./lookout/style/format/benchmarks/general_report.py-                          ppcr=ppcr)
--
./lookout/style/format/benchmarks/top_repos_quality.py-
./lookout/style/format/benchmarks/top_repos_quality.py-from lookout.style.format.benchmarks.general_report import QualityReportAnalyzer
./lookout/style/format/benchmarks/top_repos_quality.py-
./lookout/style/format/benchmarks/top_repos_quality.py:# TODO(zurk): Move REPOSITORIES to ./benchmarks/data/default_repository_list.csv
./lookout/style/format/benchmarks/top_repos_quality.py-# format: "repository to-commit from-commit"
./lookout/style/format/benchmarks/top_repos_quality.py-REPOSITORIES = [
./lookout/style/format/benchmarks/top_repos_quality.py-    "https://github.com/30-seconds/30-seconds-of-code 3a122c9cfcbdc091227879a06a32bc67ccd0d35d c8c60895e80b8bc90583502accdaa339b794609c",  # noqa: E501
--
./lookout/style/format/benchmarks/top_repos_quality.py-    "https://github.com/vuejs/vue-cli 2024f4e52097bed96b527d2e519dccba334f434b 85e4f9839ba88d1283b10bb3112582792b8dac6e",  # noqa: E501
./lookout/style/format/benchmarks/top_repos_quality.py-    "https://github.com/meteor/meteor c3309b123a7220ac24cbe73661184ee946bca01f 62fa9927ce34cff064cc3991439553e7c52b5258",  # noqa: E501
./lookout/style/format/benchmarks/top_repos_quality.py-    "https://github.com/webpack/webpack babe736cfa1ef7e8014ed32ba4a4ec38049dce14 3e74cb428af04eedac60ae13d2420d2b5bd3bde1",  # noqa: E501
./lookout/style/format/benchmarks/top_repos_quality.py:    # TODO: add after bblfsh python client v3 is released and we use it
./lookout/style/format/benchmarks/top_repos_quality.py-    # "https://github.com/vuejs/vue b7105ae8c9093e36ec89a470caa3b78bda3ef467 db1d0474997e9e60b8b0d39a3b7c2af55cfd0d4a",  # noqa: E501
./lookout/style/format/benchmarks/top_repos_quality.py-    # "https://github.com/vuejs/vuex 2e62705d4bce4ebcb8eca23df8c7b849125fc565 1ac16a95c574f6b1386016fb6d4f00cfd2ee1d60",  # noqa: E501
./lookout/style/format/benchmarks/top_repos_quality.py-    "https://github.com/segmentio/evergreen ba22d511dad83c072842e47801ef42697d142f7c 1030eca5da38dce4e5047c23a3ea7fc0c246b8ce",  # noqa: E501
--
./lookout/style/format/benchmarks/top_repos_quality.py-    if os.path.exists(repository):
./lookout/style/format/benchmarks/top_repos_quality.py-        return repository
./lookout/style/format/benchmarks/top_repos_quality.py-    # clone repository
./lookout/style/format/benchmarks/top_repos_quality.py:    # TODO: use dulwich in the future
./lookout/style/format/benchmarks/top_repos_quality.py-    git_dir = os.path.join(storage_dir, get_repo_name(repository))  # location for code
./lookout/style/format/benchmarks/top_repos_quality.py-    cmd = "git clone --single-branch --branch master %s %s" % (repository, git_dir)
./lookout/style/format/benchmarks/top_repos_quality.py-    subprocess.check_call(cmd.split())
--
./lookout/style/format/benchmarks/top_repos_quality.py-def calc_weighted_avg(arr: Sequence[Sequence], col: int, weight_col: int = 5) -> float:
./lookout/style/format/benchmarks/top_repos_quality.py-    """Calculate average value in `col` weighted by column `weight_col`."""
./lookout/style/format/benchmarks/top_repos_quality.py-    numerator, denominator = 0, 0
./lookout/style/format/benchmarks/top_repos_quality.py:    # TODO: switch to numpy arrays
./lookout/style/format/benchmarks/top_repos_quality.py-    for row in arr:
./lookout/style/format/benchmarks/top_repos_quality.py-        numerator += row[col] * row[weight_col]
./lookout/style/format/benchmarks/top_repos_quality.py-        denominator += row[weight_col]
--
./lookout/style/format/benchmarks/top_repos_quality.py-def calc_avg(arr: Sequence[Sequence], col: int) -> float:
./lookout/style/format/benchmarks/top_repos_quality.py-    """Calculate average value in `col`."""
./lookout/style/format/benchmarks/top_repos_quality.py-    numerator, denominator = 0, 0
./lookout/style/format/benchmarks/top_repos_quality.py:    # TODO: switch to numpy arrays
./lookout/style/format/benchmarks/top_repos_quality.py-    for row in arr:
./lookout/style/format/benchmarks/top_repos_quality.py-        numerator += row[col]
./lookout/style/format/benchmarks/top_repos_quality.py-        denominator += 1
--
./lookout/style/format/benchmarks/top_repos_quality.py-def _get_model_summary(report: str) -> (int, float):
./lookout/style/format/benchmarks/top_repos_quality.py-    """Extract model summary - number of rules and avg. len."""
./lookout/style/format/benchmarks/top_repos_quality.py-    data = _get_json_data(report)
./lookout/style/format/benchmarks/top_repos_quality.py:    # TODO(vmarkovtsev): address this embarrasing hardcode
./lookout/style/format/benchmarks/top_repos_quality.py-    return data["javascript"]["num_rules"], data["javascript"]["avg_rule_len"]
./lookout/style/format/benchmarks/top_repos_quality.py-
./lookout/style/format/benchmarks/top_repos_quality.py-

and FIXME

./lookout/style/format/benchmarks/general_report.py-        vnodes = chain.from_iterable(fix.file_vnodes for fix in fixes)
./lookout/style/format/benchmarks/general_report.py-        ys = numpy.hstack(fix.y for fix in fixes)
./lookout/style/format/benchmarks/general_report.py-        y_pred_pure = numpy.hstack(fix.y_pred_pure for fix in fixes)
./lookout/style/format/benchmarks/general_report.py:        # FIXME(vmarkovtsev): we are taking the first fix here which does not work for >1 language
./lookout/style/format/benchmarks/general_report.py-        return generate_quality_report(
./lookout/style/format/benchmarks/general_report.py-            fixes[0].language, self.model.ptr, vnodes, ys, y_pred_pure, self.config["max_files"],
./lookout/style/format/benchmarks/general_report.py-            fixes[0].feature_extractor.composite_class_representations)

cmd to collect it grep -r --exclude=\*.{xml,js,coffee,md,map,ts,in,html,yml,npmignore,ipynb,go,proto} "FIXME" -A3 -B3 . grep -r --exclude=\*.{xml,js,coffee,md,map,ts,in,html,yml,npmignore,ipynb,go,proto} "TODO" -A3 -B3 .

EgorBu commented 5 years ago

# ./lookout/style/format/analyzer.py:
                "\n\t".join("%-55s %.5E" % (fe.feature_names[i], importances[i])
                            for i in numpy.argsort(-importances)[:25] if importances[i] > 1e-5))
            submit_event("%s.train.%s.rules" % (cls.name, language), len(trainable_rules.rules))
            # TODO(vmarkovtsev): save the achieved precision, recall, etc. to the model
            # throw away imprecise classes
            if trainable_rules.rules.rules:
                model[language] = trainable_rules.rules

should be covered by https://github.com/src-d/style-analyzer/pull/534

EgorBu commented 5 years ago

# ./lookout/style/format/classes.py-
"""Predicted class definitions."""
# TODO(zurk): refactor CLS related code into one class. Related issue:
# https://github.com/src-d/style-analyzer/issues/286

CLS_SPACE = "<space>"

EgorBu commented 5 years ago

# ./lookout/style/format/benchmarks/evaluate_smoke.py- 
       else:
            detected_wrong_fix += 1

    # TODO (zurk): Add proper class for benchmark metrics
    # https://github.com/src-d/style-analyzer/issues/333
    return misdetection, undetected, detected_wrong_fix, detected_correct_fix

related issue: Rename benchmark metric names

EgorBu commented 5 years ago

#./lookout/style/format/benchmarks/evaluate_smoke.py-
    :return: A dictionary with losses and predicted code for "global" and "local" indentation \
             strategies.
    """
    # TODO (zurk): make optional arguments non-optional.
    if (vnodes is None) ^ (fe is None):
        raise ValueError("vnodes and fe should be both None or not None.")
    losses = {}

@zurk , could you explain what should be done here?

EgorBu commented 5 years ago

# ./lookout/style/format/benchmarks/generate_smoke.py-
        writer = csv.DictWriter(index_file, fieldnames=["repo", "style", "from", "to"])
        writer.writeheader()
        for style_name, (pattern, repl) in js_format_rules.items():
            # TODO(zurk): handle custom modification functions as well as regexp.
            pattern = re.compile(pattern)
            for repopath in repopaths:
                repo = Repo(str(repopath))

could be done like this

if isinstance(pattern, str):
    pattern = re.compile(pattern)
    pattern = lambda repl, code: re.sub(pattern, repl, code)
....
new_code = pattern(repl, code)
filepath.write_text(new_code)

EgorBu commented 5 years ago

#./lookout/style/format/benchmarks/general_report.py-
        to_show = to_show[:max_files]

    template = _load_jinja2_template("quality_report.md.jinja2")
    # TODO(vmarkovtsev): move all the logic inside the template
    res = template.render(language=language, ptr=ptr, conf_mat=mat, target_names=target_names,
                          files=to_show, cl_report=c_sorted, cl_report_full=c_report_full,
                          ppcr=ppcr)

it's possible to create dictionary and send it to render - is it your intention, @vmarkovtsev ?

EgorBu commented 5 years ago

#./lookout/style/format/benchmarks/top_repos_quality.py-
from lookout.style.format.benchmarks.general_report import QualityReportAnalyzer

# TODO(zurk): Move REPOSITORIES to ./benchmarks/data/default_repository_list.csv
# format: "repository to-commit from-commit"
REPOSITORIES = [
    "https://github.com/30-seconds/30-seconds-of-code 3a122c9cfcbdc091227879a06a32bc67ccd0d35d c8c60895e80b8bc90583502accdaa339b794609c",  # noqa: E501

EgorBu commented 5 years ago

#./lookout/style/format/benchmarks/top_repos_quality.py-
    if os.path.exists(repository):
        return repository
    # clone repository
    # TODO: use dulwich in the future
    git_dir = os.path.join(storage_dir, get_repo_name(repository))  # location for code
    cmd = "git clone --single-branch --branch master %s %s" % (repository, git_dir)
    subprocess.check_call(cmd.split())

example how to do it and how it should look in the end

from dulwich import porcelain
...

porcelain.clone(repository, git_dir)

EgorBu commented 5 years ago

#./lookout/style/format/benchmarks/top_repos_quality.py-
def calc_weighted_avg(arr: Sequence[Sequence], col: int, weight_col: int = 5) -> float:
    """Calculate average value in `col` weighted by column `weight_col`."""
    numerator, denominator = 0, 0
    # TODO: switch to numpy arrays
    for row in arr:
        numerator += row[col] * row[weight_col]
        denominator += row[weight_col]

solution

arr = numpy.array(arr)
return (arr[:, col] * arr[:, weight_col]).sum() / arr[:, weight_col].sum()

EgorBu commented 5 years ago

#./lookout/style/format/benchmarks/top_repos_quality.py-
def calc_avg(arr: Sequence[Sequence], col: int) -> float:
    """Calculate average value in `col`."""
    numerator, denominator = 0, 0
    # TODO: switch to numpy arrays
    for row in arr:
        numerator += row[col]
        denominator += 1

solution

arr = numpy.array(arr)
return arr[:, col].sum() / arr.shape[0]

EgorBu commented 5 years ago

#./lookout/style/format/benchmarks/top_repos_quality.py-
def _get_model_summary(report: str) -> (int, float):
    """Extract model summary - number of rules and avg. len."""
    data = _get_json_data(report)
    # TODO(vmarkovtsev): address this embarrasing hardcode
    return data["javascript"]["num_rules"], data["javascript"]["avg_rule_len"]

Solution: iterate through available languages and return these parameters for each language

EgorBu commented 5 years ago

#./lookout/style/format/benchmarks/general_report.py-
        vnodes = chain.from_iterable(fix.file_vnodes for fix in fixes)
        ys = numpy.hstack(fix.y for fix in fixes)
        y_pred_pure = numpy.hstack(fix.y_pred_pure for fix in fixes)
        # FIXME(vmarkovtsev): we are taking the first fix here which does not work for >1 language
        return generate_quality_report(
            fixes[0].language, self.model.ptr, vnodes, ys, y_pred_pure, self.config["max_files"],
            fixes[0].feature_extractor.composite_class_representations)

Solution: generate quality report per language - requires refactoring of reports

zurk commented 5 years ago

@vmarkovtsev let's close this one. This issue related to old refactoring phase. Some of the TODOs were solved, but we have a bunch of new TODOs already, but it is another story.

src-d / style-analyzer

Review all TODOs and FIXMEs #526