harvard-lil / capstone

CAP database scripts.
MIT License
188 stars 44 forks source link

Add/remove fields on CasesMetadata convert script #2152

Closed kilbergr closed 1 year ago

kilbergr commented 1 year ago

This PR addresses this ticket to remove the Volume, Reporter, court.slug, jurisdiction.slug, jurisdiction.whitelisted, and preview fields, and this ticket to add the first_page_order and last_page_order fields to the CasesMetadata file. It also removes all URLs, as we decided those should no longer be included (they would point to what will become dead links).

The only difference not due to flake8 in the serializers file is:

class ConvertCaseDocumentSerializer(CaseDocumentSerializer):
    """Case serializer variant that trims the parent's output for conversion.

    Starting from the representation built by ``CaseDocumentSerializer``, it
    drops the reporter, volume and URL entries (plus a few nested court and
    jurisdiction keys), drops ``preview`` when present, and attaches
    ``first_page_order`` / ``last_page_order`` taken from the serializer
    context.
    """

    first_page_order = serializers.CharField()
    last_page_order = serializers.CharField()

    def to_representation(self, instance):
        """Return the parent representation with fields removed and added.

        The page-order values are read from ``self.context`` and written to
        the output unchanged; a key missing from the context yields ``None``.
        Every removed key except ``preview`` is required to exist, so a
        missing one raises ``KeyError``.
        """
        # NOTE(review): the class-level fields are declared as CharField, but
        # the values emitted here are whatever the caller placed in the
        # context, with no conversion applied.
        page_order = {
            field: self.context.get(field)
            for field in ("first_page_order", "last_page_order")
        }

        data = super().to_representation(instance)

        # Mandatory removals, in a fixed order: top-level keys first, then
        # the keys nested under "court" and "jurisdiction".
        top_level_keys = (
            "reporter",
            "volume",
            "url",
            "frontend_url",
            "frontend_pdf_url",
        )
        nested_keys = (
            ("court", ("slug", "url")),
            ("jurisdiction", ("slug", "whitelisted", "url")),
        )
        for key in top_level_keys:
            del data[key]
        for parent, children in nested_keys:
            for child in children:
                del data[parent][child]

        # "preview" is the only optional key; discard it if it is there.
        data.pop("preview", None)

        data.update(page_order)
        return data

Example results:

{"id": 5208684, "name": "The People of the State of Illinois, ex relatione The Merchants' Savings, Loan and Trust Company of Chicago, v. The Auditor of Public Accounts", "name_abbreviation": "People ex rel. Merchants' Savings, Loan & Trust Co. v. Auditor of Public Accounts", "decision_date": "1863-01", "docket_number": "", "first_page": "434", "last_page": "446", "citations": [{"type": "official", "cite": "30 Ill. 434"}], "court": {"name_abbreviation": "Ill.", "id": 8772, "name": "Illinois Supreme Court"}, "jurisdiction": {"id": 29, "name_long": "Illinois", "name": "Ill."}, "cites_to": [{"cite": "12 Ill. 307", "category": "reporters:state", "reporter": "Ill.", "weight": 2, "pin_cites": [{"page": "316"}], "opinion_id": 0}, {"cite": "11 Ill. 202", "category": "reporters:state", "reporter": "Ill.", "opinion_id": 0}], "analysis": {"cardinality": 1022, "char_count": 27052, "ocr_confidence": 0.563, "pagerank": {"raw": 2.567971502108765e-07, "percentile": 0.8128688244316572}, "sha256": "af739ca52bfe50d170be20891df7c8f5651240aeb11c43f36be106a51e958892", "simhash": "1:3fce2c3a046ffb5f", "word_count": 4714, "random_id": 3094949697, "random_bucket": 12097}, "last_updated": "2023-01-31T18:58:11.873681+00:00", "provenance": {"date_added": "2019-08-29", "source": "Harvard", "batch": "2018"}, "first_page_order": 432, "last_page_order": 444}
{"id": 5208802, "name": "Samuel D. Havely, Plaintiff in Error, v. Francis H. Lowry, Defendant in Error", "name_abbreviation": "Havely v. Lowry", "decision_date": "1863-01", "docket_number": "", "first_page": "446", "last_page": "451", "citations": [{"type": "official", "cite": "30 Ill. 446"}], "court": {"name_abbreviation": "Ill.", "id": 8772, "name": "Illinois Supreme Court"}, "jurisdiction": {"id": 29, "name_long": "Illinois", "name": "Ill."}, "cites_to": [{"cite": "11 Ill. 618", "category": "reporters:state", "reporter": "Ill.", "opinion_id": -1}, {"cite": "24 Ill. 645", "category": "reporters:state", "reporter": "Ill.", "opinion_id": -1}, {"cite": "26 Ill. 368", "category": "reporters:state", "reporter": "Ill.", "opinion_id": -1}, {"cite": "25 Ill. 346", "category": "reporters:state", "reporter": "Ill.", "opinion_id": -1}, {"cite": "23 Ill. 320", "category": "reporters:state", "reporter": "Ill.", "opinion_id": -1}, {"cite": "23 Ill. 382", "category": "reporters:state", "reporter": "Ill.", "opinion_id": -1}, {"cite": "16 Johns. 288", "category": "reporters:state", "reporter": "Johns.", "opinion_id": -1}, {"cite": "14 Wend. 123", "category": "reporters:state", "reporter": "Wend.", "opinion_id": -1}, {"cite": "17 Johns. 128", "category": "reporters:state", "reporter": "Johns.", "opinion_id": -1}, {"cite": "8 Barb. 513", "category": "reporters:state", "reporter": "Barb.", "opinion_id": -1}, {"cite": "25 Ill. 
344", "category": "reporters:state", "reporter": "Ill.", "opinion_id": 0}], "analysis": {"cardinality": 547, "char_count": 10473, "ocr_confidence": 0.552, "pagerank": {"raw": 4.033206840060777e-07, "percentile": 0.9052395978858723}, "sha256": "fe46d5ec96d6ea52ebef2e10b10cf71fefaf0cec30bfb5d6e166ecfc136d8954", "simhash": "1:1b3e8030761e3c80", "word_count": 1894, "random_id": 572104776, "random_bucket": 41032}, "last_updated": "2023-01-31T18:58:11.873681+00:00", "provenance": {"date_added": "2019-08-29", "source": "Harvard", "batch": "2018"}, "first_page_order": 444, "last_page_order": 449}
codecov[bot] commented 1 year ago

Codecov Report

Merging #2152 (064e40c) into experimental (9517c06) will decrease coverage by 0.13%. The diff coverage is n/a.

@@               Coverage Diff                @@
##           experimental    #2152      +/-   ##
================================================
- Coverage         62.48%   62.36%   -0.13%     
================================================
  Files               107      107              
  Lines             11695    11724      +29     
================================================
+ Hits               7308     7312       +4     
- Misses             4387     4412      +25     

see 2 files with indirect coverage changes

kilbergr commented 1 year ago

What do you think of this, @matteocargnelutti ?