Closed baolingz closed 4 years ago
Based on conversation with Lynn
SELECT location_category_description, count(*) FROM doe_lcgms."2019"
GROUP BY location_category_description
ORDER BY COUNT(*) DESC;
SELECT org_level, COUNT() FROM sca_bluebook."2019" GROUP BY org_level ORDER BY COUNT() DESC;
^RECIPE_ENGINE
- Records from LCGMS where District is 75 and/or 95 should be excluded because these are likely special ed / after school records
- None of these records flow into the output table `ceqr_school_buildings.2019`
- Create a file reporting the records from the raw 2018 semi-filtered by org level blue book that did / did not make it into the final 2018 ceqr blue book / lcgms file
WITH bluebook_missing AS( SELECT FROM sca_bluebook."2018" WHERE org_id||bldg_id NOT IN( SELECT org_id||bldg_id FROM sca_bluebook."2018_labs" ) ) SELECT FROM bluebook_missing WHERE (org_level = 'PK' OR org_level = 'PS' OR org_level = 'PSIS' OR org_level = 'IS' OR org_level = 'ISHS' OR org_level = 'HS');
^RECIPE_ENGINE
- Evaluate the 400ish records that would be added from LCGMS into final 2019 CEQR schools data and try to propose a filter
SELECT * FROM ceqr_school_buildings."2019" WHERE source = 'lcgms';
^EDM_DATA
@AmandaDoyle Records from LCGMS where District is 75 and/or 95 should be excluded because these are likely special ed / after school records
^ could you confirm this statement? since there are only 32 school districts
Filters build on @baolingz 's comment above:
SELECT * FROM bluebook
WHERE (org_level = 'PK'
OR org_level = 'PS'
OR org_level = 'PSIS'
OR org_level = 'IS'
OR org_level = 'ISHS'
OR org_level = 'HS')
Filters are hard-coded for things that are static lists. So far, I cannot find a better way of doing this (would need flags for SHSAT-admission schools and citywide G&T), but am open to suggestions!
AND charter IS NULL
AND (organization_name !~* 'ALC'
AND organization_name !~* 'Alternative'
AND organization_name !~* 'Restart'
AND organization_name !~* 'Pathways')
AND organization_name !~* 'Adult'
AND org_id NOT IN ('X445','K449','K430','M692','X696','Q687','R605','M475')
I cannot find the Academy of Talented Scholars on a list of city-wide G&T schools, though it is described as sharing a campus with Brooklyn School of Inquiry
To exclude all:
AND org_id NOT IN ('M539', 'M334', 'K686', 'K682','Q300', 'M012')
To match the possibly erroneous inclusion of TAG Young Scholars (M012):
AND org_id NOT IN ('M539', 'M334', 'K686', 'K682','Q300')
AND organization_name !~* 'pre-k'
This is the converse of the filters above, to recreate all 252 records from the 2018 bluebook not included in 2018 CEQR:
WITH bluebook_missing AS(
SELECT * FROM sca_bluebook."2018"
WHERE org_id||bldg_id NOT IN(
SELECT org_id||bldg_id
FROM sca_bluebook."2018_labs"
)
)
SELECT * FROM bluebook_missing
WHERE (org_level = 'PK'
OR org_level = 'PS'
OR org_level = 'PSIS'
OR org_level = 'IS'
OR org_level = 'ISHS'
OR org_level = 'HS'
)
AND (charter IS NOT NULL
OR (organization_name ~* 'ALC'
OR organization_name ~* 'Alternative'
OR organization_name ~* 'Restart'
OR organization_name ~* 'Pathways'
OR organization_name ~* 'pre-k'
OR organization_name ~* 'Adult')
OR (org_id IN ('X445','K449','K430','M692','X696','Q687','R605','M475', 'M539', 'M334', 'K686', 'K682','Q300', 'M012')))
;
-- Number of records existing in old file shared by Dino
-- but not in the new one: 86
SELECT * FROM doe_lcgms."2019"
WHERE building_code||location_code NOT IN(
SELECT building_code||location_code
FROM doe_lcgms."2019_new"
);
-- Number of records existing in new file shared by Dino
-- but not in the old one: 21
SELECT * FROM doe_lcgms."2019_new"
WHERE building_code||location_code NOT IN(
SELECT building_code||location_code
FROM doe_lcgms."2019"
);
-- Finalized filter
SELECT * FROM sca_bluebook."2018"
WHERE (org_level IS NOT NULL
AND org_level != 'SPED'
AND org_level != 'OTHER')
AND charter IS NULL
AND organization_name !~* 'ALC'
AND organization_name !~* 'Alternative Learning'
AND org_id NOT IN ('Q950','M973')
AND organization_name !~* 'Adult'
AND org_id NOT IN ('X445','K449','K430','M692','X696','Q687','R605','M475')
AND org_id NOT IN ('M539', 'M334', 'K686', 'K682','Q300')
AND organization_name !~* 'pre-k';
-- numebr of records eliminated by new filter
-- but not by the one applied by labs: 0
WITH bluebook_filter AS(
SELECT * FROM sca_bluebook."2018"
WHERE (org_level IS NOT NULL
AND org_level != 'SPED'
AND org_level != 'OTHER')
AND charter IS NULL
AND organization_name !~* 'ALC'
AND organization_name !~* 'Alternative Learning'
AND org_id NOT IN ('Q950','M973')
AND organization_name !~* 'Adult'
AND org_id NOT IN ('X445','K449','K430','M692','X696','Q687','R605','M475')
AND org_id NOT IN ('M539', 'M334', 'K686', 'K682','Q300')
AND organization_name !~* 'pre-k'
)
SELECT * FROM sca_bluebook."2018_labs"
WHERE org_level||bldg_id NOT IN (
SELECT org_level||bldg_id
FROM bluebook_filter
)
;
-- distinct location_code||building_code from lcgms
-- after applying filter: 101
WITH bluebook_filter AS(
SELECT * FROM sca_bluebook."2019"
WHERE (org_level IS NOT NULL
AND org_level != 'SPED'
AND org_level != 'OTHER')
AND charter IS NULL
AND organization_name !~* 'ALC'
AND organization_name !~* 'Alternative Learning'
AND org_id NOT IN ('Q950','M973')
AND organization_name !~* 'Adult'
AND org_id NOT IN ('X445','K449','K430','M692','X696','Q687','R605','M475')
AND org_id NOT IN ('M539', 'M334', 'K686', 'K682','Q300')
AND organization_name !~* 'pre-k'
)
SELECT * FROM doe_lcgms."2019_new"
WHERE location_code||building_code NOT IN (
SELECT org_id||bldg_id
FROM bluebook_filter
)
AND system_code !~* '^75'
AND system_code !~* '^84'
AND building_code !~* ' AF '
AND building_code !~* ' GYM '
;
-- distinct location_code from lcgms
-- after applying filter: 18
WITH bluebook_filter AS(
SELECT * FROM sca_bluebook."2019"
WHERE (org_level IS NOT NULL
AND org_level != 'SPED'
AND org_level != 'OTHER')
AND charter IS NULL
AND organization_name !~* 'ALC'
AND organization_name !~* 'Alternative Learning'
AND org_id NOT IN ('Q950','M973')
AND organization_name !~* 'Adult'
AND org_id NOT IN ('X445','K449','K430','M692','X696','Q687','R605','M475')
AND org_id NOT IN ('M539', 'M334', 'K686', 'K682','Q300')
AND organization_name !~* 'pre-k'
)
SELECT * FROM doe_lcgms."2019_new"
WHERE location_code NOT IN (
SELECT org_id
FROM bluebook_filter
)
AND system_code !~* '^75'
AND system_code !~* '^84'
AND building_code !~* ' AF '
AND building_code !~* ' GYM '
;
-- lcgms filter
SELECT * FROM doe_lcgms."2019_new"
WHERE system_code !~* '^75'
AND system_code !~* '^84'
AND building_code !~* ' AF '
AND building_code !~* ' GYM ';
@lseirup @AmandaDoyle There was a typo in the lcgms filter I used which results in no new schools be added from lcgms in the previous csv table I sent to you. I just updated the queries as above, and below are the takeaways after the correction.
I will implement the logic we discussed during today's meeting and send you the findings regarding any mismatches between our output and the excel file shared by Capital Planning.
@baolingz below are my notes and next steps for the LCGMS / Blue Book datasets
I think an immediate next step would be to have me get up to speed on your workflow so that I can review the current output and work on it next week as needed
1) Using the spreadsheet that Capital Planning shared confirm that new schools are in lcgms and deleted schools aren't 2) Remove records from LCGMS where the OrgID start with 75 or 84, and remove athletic fields and gyms 3) Flag records where grades have changed at a location 4) If an organization has one site in blue book but lcgms shows a different site for that organization then all students are allocated to new site. How often does this occur? 5) If an organization had 2 sites in blue book and now has 1 site in lcgms then we merge all former sites into the new site. How often does this occur?
Finally, write up all assumptions and send to capital planning Output the list of schools that are excluded from blue book and lcgms for capital planning to review How to we handle existing schools with new locations and new sitings? - question for capital planning
-- new schools in lcgms: 15
SELECT distinct on (org_id) * FROM ceqr_school_buildings."2019_all"
WHERE source = 'lcgms'
AND org_id NOT IN(
SELECT org_id
FROM ceqr_school_buildings."2019_all"
WHERE source = 'bluebook'
);
-- new sites in lcgms: 102
SELECT distinct on (org_id, bldg_id) *
FROM ceqr_school_buildings."2019_all"
WHERE source = 'lcgms'
AND org_id||bldg_id NOT IN(
SELECT org_id||bldg_id
FROM ceqr_school_buildings."2019_all"
WHERE source = 'bluebook'
);
-- potential resiting: 84
SELECT distinct on (org_id, bldg_id) *
FROM ceqr_school_buildings."2019_all"
WHERE source = 'lcgms'
AND org_id||bldg_id NOT IN(
SELECT org_id||bldg_id
FROM ceqr_school_buildings."2019_all"
WHERE source = 'bluebook'
)
AND org_id IN(
SELECT distinct org_id
FROM ceqr_school_buildings."2019_all"
WHERE source = 'lcgms'
AND org_id IN(
SELECT org_id
FROM ceqr_school_buildings."2019_all"
WHERE source = 'bluebook'
)
AND org_id||bldg_id NOT IN(
SELECT org_id||bldg_id
FROM ceqr_school_buildings."2019_all"
WHERE source = 'bluebook'
)
)
;
-- potential mergers: 13
SELECT distinct on (org_id, bldg_id) *
FROM ceqr_school_buildings."2019_all"
WHERE source = 'bluebook'
AND bldg_id IN(
SELECT bldg_id
FROM ceqr_school_buildings."2019_all"
WHERE source = 'lcgms'
)
AND org_id||bldg_id NOT IN(
SELECT org_id||bldg_id
FROM ceqr_school_buildings."2019_all"
WHERE source = 'lcgms'
)
;
-- Bluebook filter
SELECT a.*,'bluebook' AS source, ST_TRANSFORM(ST_SetSRID(ST_MakePoint(a.x::NUMERIC, a.y::NUMERIC),2263),4326) AS geom
FROM sca_bluebook."2019" a
-- exclude NULL, Special Ed, and Other ord levels
WHERE a.org_id IS NOT NULL
AND(a.org_level IS NOT NULL
AND a.org_level != 'SPED'
AND a.org_level != 'OTHER')
-- exclude charter schools
AND a.charter IS NULL
-- exclude alternative high schools. Need definitive list from captial planning
AND upper(a.organization_name) !~* 'ALC'
AND upper(a.organization_name) !~* 'ALTERNATIVE LEARNING'
AND a.org_id NOT IN ('Q950','M973')
-- exclude adult learning centers and adult continuing education
AND a.organization_name !~* 'Adult'
-- exclude competitive high schools. Need definitive list from captial planning
AND a.org_id NOT IN ('X445','K449','K430','M692','X696','Q687','R605','M475')
-- exclude citywide gifted and talented schools. Need definitive list from captial planning
AND a.org_id NOT IN ('M539', 'M334', 'K686', 'K682','Q300', 'M012')
-- exclude pre k records
AND a.organization_name !~* 'pre-k';
-- LCGMS filter
SELECT a.*, 'lcgms' AS source, ST_SetSRID(ST_MakePoint(REPLACE(a.longitude,'NULL', '0')::NUMERIC,REPLACE(a.latitude,'NULL', '0')::NUMERIC),4326) AS geom
FROM doe_lcgms."2019_new" a
-- exclude NULL, Early Childhood, and Ungraded org levels
WHERE (a.location_category_description IS NOT NULL
AND a.location_category_description != 'Early Childhood'
AND a.location_category_description != 'Ungraded')
-- exclude charter schools. Exclude system_code ~* '^84' also works too
AND a.managed_by_name = 'DOE'
-- exclude Transfer, Career Technical, Special Education, and Home Schools. Exclude system_code ~* '^75' also works to exclude Special Education schools
AND a.location_type_description = 'General Academic'
-- exclude competitive high schools. Need definitive list from captial planning
AND a.location_code NOT IN ('X445','K449','K430','M692','X696','Q687','R605','M475')
-- exclude citywide gifted and talented schools. Need definitive list from captial planning
AND a.location_code NOT IN ('M539', 'M334', 'K686', 'K682','Q300', 'M012')
-- there is nothing to indicate if something is a gym or athletic field. It looks like they're already excluded.
We finalized this pipeline
The ETL pipeline is almost ready, except:
org_level
,ps_enrollment
,ms_enrollment
,hs_enrollment
2019
table has doubled its number of records compared to the2018
and2017
tables. Based on my observation, this issue might get solved after applying the following filter.