lichess-org / chess-openings

An aggregated data set of chess opening names
Creative Commons Zero v1.0 Universal
379 stars 96 forks source link

gen.py argparse #134

Closed ChessIaCreator closed 1 year ago

ChessIaCreator commented 1 year ago
import io
import re
import sys
import argparse  # Added argparse for better command-line argument parsing
from typing import Dict, List, TextIO

try:
    import chess
    import chess.pgn
except ImportError:
    print("Need python-chess:", file=sys.stderr)
    print("$ pip3 install chess", file=sys.stderr)
    print(file=sys.stderr)
    raise

# An ECO code: one letter A-E followed by exactly two digits, nothing else.
ECO_REGEX = re.compile(r"^[A-E]\d\d\Z")
# Whitespace problems in a name: two or more consecutive whitespace characters,
# leading whitespace, trailing whitespace, or whitespace directly before a comma.
INVALID_SPACE = re.compile(r"\s{2,}|^\s|\s\Z|\s,")
# The word "with" preceded by a whitespace character whose own predecessor is
# neither ',' nor ':'.
INVALID_WITH = re.compile(r"[^,:]\swith\b")

class Stats:
    """Running totals of the errors and warnings reported so far."""

    def __init__(self) -> None:
        # Both counters start at zero; reporters increment them as they print.
        self.errors, self.warnings = 0, 0

class Reporter:
    """Report problems found in one input file and count them in shared stats.

    Every message is printed to stderr as
    ``::<level> file=<file_name>,line=<lno>::<message>`` and the matching
    counter on ``stats`` is incremented afterwards.
    """

    def __init__(self, stats: Stats, file_name: str) -> None:
        self.stats = stats
        self.file_name = file_name

    def _emit(self, level: str, lno: int, err_msg: str) -> None:
        """Print a single annotation line of the given level to stderr."""
        print(f"::{level} file={self.file_name},line={lno}::{err_msg}", file=sys.stderr)

    def error(self, lno: int, err_msg: str) -> None:
        """Print an error for line ``lno`` and bump ``stats.errors``."""
        self._emit("error", lno, err_msg)
        self.stats.errors += 1

    def warning(self, lno: int, err_msg: str) -> None:
        """Print a warning for line ``lno`` and bump ``stats.warnings``."""
        self._emit("warning", lno, err_msg)
        self.stats.warnings += 1

def parse_args() -> argparse.Namespace:
    """Parse the validator's command line (taken from ``sys.argv``).

    Returns:
        A namespace with ``input_files`` (a list of one or more TSV paths) and
        ``disable_warnings`` (``True`` when ``--disable-warnings`` was given).
    """
    cli = argparse.ArgumentParser(description="Chess Opening Data Validator")
    cli.add_argument(
        "input_files",
        nargs="+",
        help="Input TSV files to process",
    )
    cli.add_argument(
        "--disable-warnings",
        action="store_true",
        help="Disable warning checks",
    )
    namespace = cli.parse_args()
    return namespace

def main(f: TextIO, reporter: Reporter, by_epd: Dict[str, List[str]], shortest_by_name: Dict[str, int]) -> None:
    """Validate one openings TSV file and print an extended row per accepted line.

    The first line must be the header ``eco<TAB>name<TAB>pgn``; every later
    line is checked and, unless a fatal check fails, printed to stdout as
    ``eco, name, pgn, uci, epd`` separated by tabs.

    Args:
        f: Open text stream of the TSV file.
        reporter: Receives errors and warnings, keyed by 1-based line number.
        by_epd: Maps an EPD to the columns of the first row that reached it.
            Mutated in place so duplicates can be detected across calls.
        shortest_by_name: Maps an opening name to the smallest number of moves
            seen for it so far. Mutated in place.
    """
    # Loop-invariant lookup tables, built once instead of once per row.
    allowed_lowers = ("with", "de", "der", "del", "von", "and")
    blacklisted_words = ("refused",)

    prev_eco = ""
    prev_name = ""

    for lno, line in enumerate(f, 1):
        cols = line.rstrip("\n").split("\t")

        if len(cols) != 3:
            reporter.error(lno, f"expected 3 columns, got {len(cols)}")
            continue

        if lno == 1:
            # Header row: checked, but never emitted as data.
            if cols != ["eco", "name", "pgn"]:
                reporter.error(lno, "expected eco, name, pgn")
            continue

        eco, name, pgn = cols

        # Fatal checks: a row failing any of these is skipped entirely.
        if not ECO_REGEX.match(eco):
            reporter.error(lno, "invalid eco")
            continue

        if INVALID_SPACE.search(name):
            reporter.error(lno, "invalid whitespace in name")
            continue

        try:
            board = chess.pgn.read_game(io.StringIO(pgn), Visitor=chess.pgn.BoardBuilder)
        except ValueError as err:
            reporter.error(lno, str(err))
            continue

        if not board:
            reporter.error(lno, "Empty pgn")
            continue

        ply_count = len(board.move_stack)

        # Naming-convention warnings; these never stop the row from being emitted.
        checked_words = (
            word
            for word in re.split(r"\s|-", name)
            if word not in allowed_lowers and word.isalpha()
        )
        if not all(word[0].isupper() for word in checked_words):
            reporter.warning(lno, f"{name!r} word(s) beginning with lowercase letters")

        if INVALID_WITH.search(name):
            reporter.warning(lno, "'with' not separated with ',' or ':'")

        lowered_name = name.lower()
        for blacklisted in blacklisted_words:
            if blacklisted in lowered_name:
                reporter.warning(lno, f"blacklisted word ({blacklisted!r} in {name!r})")

        # Warn when this line ties the shortest line already recorded for the
        # same name, then record the new minimum.
        known_shortest = shortest_by_name.get(name)
        if known_shortest == ply_count:
            reporter.warning(lno, f"{name!r} does not have a unique shortest line")
        if known_shortest is None:
            shortest_by_name[name] = ply_count
        else:
            shortest_by_name[name] = min(known_shortest, ply_count)

        # The stored PGN must equal the SAN rendering of the parsed moves.
        clean_pgn = chess.Board().variation_san(board.move_stack)
        if clean_pgn != pgn:
            reporter.error(lno, f"unclean pgn: expected {clean_pgn!r}, got {pgn!r}")

        if name.count(":") > 1:
            reporter.error(lno, f"multiple ':' in name: {name}")

        epd = board.epd()
        if epd in by_epd:
            reporter.error(lno, f"duplicate epd: {by_epd[epd]}")
        else:
            by_epd[epd] = cols

        # Rows must be sorted by ECO code first, then by name.
        if eco < prev_eco:
            reporter.error(lno, f"not ordered by eco ({eco} after {prev_eco})")
        elif (eco, name) < (prev_eco, prev_name):
            reporter.error(lno, f"not ordered by name ({name!r} after {prev_name!r})")
        prev_eco = eco
        prev_name = name

        print(eco, name, clean_pgn, " ".join(m.uci() for m in board.move_stack), epd, sep="\t")

if __name__ == "__main__":
    args = parse_args()

    class _QuietReporter(Reporter):
        """Reporter used for --disable-warnings: warnings are neither printed nor counted."""

        def warning(self, lno: int, err_msg: str) -> None:
            pass

    # Honor --disable-warnings; previously the flag was parsed but ignored.
    reporter_cls = _QuietReporter if args.disable_warnings else Reporter

    # Header of the generated table written to stdout.
    print("eco", "name", "pgn", "uci", "epd", sep="\t")

    stats = Stats()
    # Shared across all input files so duplicates are detected globally.
    by_epd: Dict[str, List[str]] = {}
    shortest_by_name: Dict[str, int] = {}
    for file_name in args.input_files:
        # Decode explicitly so results do not depend on the platform's default
        # encoding (assumes the TSV files are UTF-8 -- confirm).
        with open(file_name, encoding="utf-8") as f:
            main(f, reporter_cls(stats, file_name), by_epd, shortest_by_name)
    # A non-zero exit status signals that at least one error was reported.
    if stats.errors:
        sys.exit(1)
niklasf commented 1 year ago

Hi. Here's how to create a pull request: https://docs.github.com/articles/creating-a-pull-request