RMI-PACTA / pacta.data.preparation

The goal of {pacta.data.preparation} is to prepare and format all input datasets required to run the PACTA for investors tools.
https://rmi-pacta.github.io/pacta.data.preparation/
Other
1 stars 0 forks source link

investigate differences between new AR format and old AR format #8

Open cjyetman opened 1 year ago

cjyetman commented 1 year ago

Especially with the recent change with AR, we're probably going to eventually need to switch to their new format. We should investigate the differences and see if we can switch to using the new format.

library(tidyverse)
library(pacta.data.preparation)

ar_data_path <- "~/Documents/Data/Asset Resolution/2022-08-15_AR_2021Q4"
ar_advanced_company_indicators_path <- file.path(ar_data_path, "2022-08-24_AR_2021Q4_RMI-Company-Indicators.xlsx")
masterdata_debt_path <- file.path(ar_data_path, "2022-10-05_rmi_masterdata_debt_2021q4.csv")
masterdata_ownership_path <- file.path(ar_data_path, "2022-08-15_rmi_masterdata_ownership_2021q4.csv")

ar_advanced_company_indicators <- import_ar_advanced_company_indicators(ar_advanced_company_indicators_path, fix_names = TRUE)
masterdata_debt <- readr::read_csv(masterdata_debt_path, na = "", show_col_types = FALSE)
masterdata_ownership <- readr::read_csv(masterdata_ownership_path, na = "", show_col_types = FALSE)

# -------------------------------------------------------------------------

ownership_data <- 
  ar_advanced_company_indicators %>% 
  filter(consolidation_method == "Equity Ownership") %>% 
  filter(value_type == "production") %>%
  filter(
    asset_sector == "Aviation" & activity_unit %in% c("pkm", "tkm") |
      asset_sector == "Cement" & activity_unit == "t cement" |
      asset_sector == "Coal" & activity_unit == "t coal" |
      asset_sector == "HDV" & activity_unit == "# vehicles" |
      asset_sector == "LDV" & activity_unit == "# vehicles" |
      asset_sector == "Oil&Gas" & activity_unit == "GJ" |
      asset_sector == "Power" & activity_unit == "MW" |
      asset_sector == "Shipping" & activity_unit == "dwt km" |
      asset_sector == "Steel" & activity_unit == "t steel"
  ) %>% 
  pivot_wider(names_from = "year", values_fill = 0) %>% 
  group_by(asset_sector) %>% 
  summarise(new_2016 = sum(`2016`, na.rm = TRUE), new_n = length(unique(company_id)))

masterdata_ownership %>% 
  filter(company_id %in% ar_advanced_company_indicators$company_id) %>% 
  group_by(sector) %>% 
  summarise(old_2016 = sum(`_2016`, na.rm = TRUE), old_n = length(unique(company_id))) %>% 
  full_join(ownership_data, by = c(sector = "asset_sector")) %>% 
  mutate(
    new_2016 = if_else(is.na(new_2016), as.numeric(0), as.numeric(new_2016)),
    new_n = if_else(is.na(new_n), as.numeric(0), as.numeric(new_n))
  ) %>% 
  mutate(diff = old_2016 - new_2016) %>% 
  mutate(percent_diff = round(diff / old_2016 * 100, digits = 2))

#> # A tibble: 9 × 7
#>   sector   old_2016 old_n new_2016 new_n     diff percent_diff
#>   <chr>       <dbl> <int>    <dbl> <dbl>    <dbl>        <dbl>
#> 1 Aviation  6.01e12  1820  6.01e12  1820 -3.12e-2         0   
#> 2 Cement    5.01e 9  2070  4.67e 9  2035  3.46e+8         6.91
#> 3 Coal      1.09e10  2051  1.09e10  2061 -1.47e-2         0   
#> 4 HDV       0         575  0         575  0             NaN   
#> 5 LDV       1.39e 8   394  1.39e 8   435 -3.73e-1         0   
#> 6 Oil&Gas   7.11e11  4563  7.11e11  4563 -5.32e-2         0   
#> 7 Power     1.30e 7 36846  1.30e 7 36875 -3.26e-1         0   
#> 8 Shipping  2.56e14 11167  2.56e14 11169  2.95e+9         0   
#> 9 Steel     2.73e 9  1105  2.73e 9  1105 -5.76e-3         0

# -------------------------------------------------------------------------

fin_control_data <- 
  ar_advanced_company_indicators %>% 
  filter(consolidation_method == "Financial Control") %>% 
  filter(value_type == "production") %>%
  filter(
    asset_sector == "Aviation" & activity_unit %in% c("pkm", "tkm") |
      asset_sector == "Cement" & activity_unit == "t cement" |
      asset_sector == "Coal" & activity_unit == "t coal" |
      asset_sector == "HDV" & activity_unit == "# vehicles" |
      asset_sector == "LDV" & activity_unit == "# vehicles" |
      asset_sector == "Oil&Gas" & activity_unit == "GJ" |
      asset_sector == "Power" & activity_unit == "MW" |
      asset_sector == "Shipping" & activity_unit == "dwt km" |
      asset_sector == "Steel" & activity_unit == "t steel"
  ) %>% 
  pivot_wider(names_from = "year", values_fill = 0) %>% 
  group_by(asset_sector) %>% 
  summarise(new_2016 = sum(`2016`, na.rm = TRUE), new_n = length(unique(company_id)))

masterdata_debt %>% 
  filter(company_id %in% ar_advanced_company_indicators$company_id) %>% 
  group_by(sector) %>% 
  summarise(old_2016 = sum(`_2016`, na.rm = TRUE), old_n = length(unique(company_id))) %>% 
  full_join(fin_control_data, by = c(sector = "asset_sector")) %>% 
  mutate(
    new_2016 = if_else(is.na(new_2016), as.numeric(0), as.numeric(new_2016)),
    new_n = if_else(is.na(new_n), as.numeric(0), as.numeric(new_n))
  ) %>% 
  mutate(diff = old_2016 - new_2016) %>% 
  mutate(percent_diff = round(diff / old_2016 * 100, digits = 2))

#> # A tibble: 9 × 7
#>   sector   old_2016 old_n new_2016 new_n     diff percent_diff
#>   <chr>       <dbl> <int>    <dbl> <dbl>    <dbl>        <dbl>
#> 1 Aviation  3.24e12  1302  5.94e12  1609 -2.70e12        -83.3
#> 2 Cement    2.89e 9  1459  4.27e 9  1685 -1.39e 9        -48.0
#> 3 Coal      6.31e 9  1349  1.12e10  1629 -4.87e 9        -77.2
#> 4 HDV       0         283  0         426  0              NaN  
#> 5 LDV       9.36e 7   194  1.16e 8   300 -2.24e 7        -23.9
#> 6 Oil&Gas   3.06e11  2864  6.85e11  3864 -3.79e11       -124. 
#> 7 Power     6.12e 6 29582  1.28e 7 34012 -6.68e 6       -109. 
#> 8 Shipping  1.70e14  9844  2.53e14 10518 -8.30e13        -48.9
#> 9 Steel     1.66e 9   757  2.70e 9   924 -1.03e 9        -62.1
cjyetman commented 1 year ago

We recently learned that the masterdata_debt CSV is actually showing direct ownership, not rolled up financial control nor credit parent control. Comparing "Direct Ownership" in the Advanced Indicators XLS to the masterdata_debt data gives a much better picture, but we would still rely on the masterdata_debt file to get the credit parent ID to do the credit parent roll-up we currently rely on.

library(tidyverse)
library(pacta.data.preparation)

ar_data_path <- "~/Documents/Data/Asset Resolution/2022-08-15_AR_2021Q4"
ar_advanced_company_indicators_path <- file.path(ar_data_path, "2022-08-24_AR_2021Q4_RMI-Company-Indicators.xlsx")
masterdata_debt_path <- file.path(ar_data_path, "2022-10-05_rmi_masterdata_debt_2021q4.csv")
masterdata_ownership_path <- file.path(ar_data_path, "2022-08-15_rmi_masterdata_ownership_2021q4.csv")

ar_advanced_company_indicators <- import_ar_advanced_company_indicators(ar_advanced_company_indicators_path, fix_names = TRUE)
masterdata_debt <- readr::read_csv(masterdata_debt_path, na = "", show_col_types = FALSE)
masterdata_ownership <- readr::read_csv(masterdata_ownership_path, na = "", show_col_types = FALSE)

# -------------------------------------------------------------------------

direct_ownership_data <- 
  ar_advanced_company_indicators %>% 
  filter(consolidation_method == "Direct Ownership") %>% 
  filter(value_type == "production") %>%
  filter(
    asset_sector == "Aviation" & activity_unit %in% c("pkm", "tkm") |
      asset_sector == "Cement" & activity_unit == "t cement" |
      asset_sector == "Coal" & activity_unit == "t coal" |
      asset_sector == "HDV" & activity_unit == "# vehicles" |
      asset_sector == "LDV" & activity_unit == "# vehicles" |
      asset_sector == "Oil&Gas" & activity_unit == "GJ" |
      asset_sector == "Power" & activity_unit == "MW" |
      asset_sector == "Shipping" & activity_unit == "dwt km" |
      asset_sector == "Steel" & activity_unit == "t steel"
  ) %>% 
  pivot_wider(names_from = "year", values_fill = 0) %>% 
  group_by(asset_sector) %>% 
  summarise(new_2016 = sum(`2016`, na.rm = TRUE), new_n = length(unique(company_id)))

masterdata_debt %>% 
  filter(company_id %in% ar_advanced_company_indicators$company_id) %>% 
  group_by(sector) %>% 
  summarise(old_2016 = sum(`_2016`, na.rm = TRUE), old_n = length(unique(company_id))) %>% 
  full_join(direct_ownership_data, by = c(sector = "asset_sector")) %>% 
  mutate(
    new_2016 = if_else(is.na(new_2016), as.numeric(0), as.numeric(new_2016)),
    new_n = if_else(is.na(new_n), as.numeric(0), as.numeric(new_n))
  ) %>% 
  mutate(diff = old_2016 - new_2016) %>% 
  mutate(percent_diff = round(diff / old_2016 * 100, digits = 2))

#> # A tibble: 9 × 7
#>   sector   old_2016 old_n new_2016 new_n     diff percent_diff
#>   <chr>       <dbl> <int>    <dbl> <dbl>    <dbl>        <dbl>
#> 1 Aviation  3.24e12  1302  3.24e12  1302 -1.46e-3         0   
#> 2 Cement    2.89e 9  1459  2.73e 9  1448  1.53e+8         5.29
#> 3 Coal      6.31e 9  1349  6.31e 9  1359  0               0   
#> 4 HDV       0         283  0         283  0             NaN   
#> 5 LDV       9.36e 7   194  9.36e 7   228  0               0   
#> 6 Oil&Gas   3.06e11  2864  3.06e11  2864 -2.48e-2         0   
#> 7 Power     6.12e 6 29582  6.12e 6 29611 -8.47e-2         0   
#> 8 Shipping  1.70e14  9844  1.70e14  9846 -4.38e-1         0   
#> 9 Steel     1.66e 9   757  1.66e 9   757 -3.81e-3         0
cjyetman commented 1 year ago

The bespoke masterdata_* files seem to include data for Cement - Grinding assets, while the Advanced Company Indicators does not. Filtering those rows out of the masterdata_* files appears to remove the discrepancies apart from very tiny, probable rounding errors and Shipping discrepancy in Equity Ownership (small relative to the sector (% diff), but still substantial). There are also still discrepancies in the number of rows in some sectors which I still do not have an explanation for.

library(tidyverse)
library(pacta.data.preparation)

ar_data_path <- "~/Documents/Data/Asset Resolution/2022-08-15_AR_2021Q4"
ar_advanced_company_indicators_path <- file.path(ar_data_path, "2022-08-24_AR_2021Q4_RMI-Company-Indicators.xlsx")
masterdata_debt_path <- file.path(ar_data_path, "2022-10-05_rmi_masterdata_debt_2021q4.csv")
masterdata_ownership_path <- file.path(ar_data_path, "2022-08-15_rmi_masterdata_ownership_2021q4.csv")

ar_advanced_company_indicators <- import_ar_advanced_company_indicators(ar_advanced_company_indicators_path, fix_names = TRUE)
masterdata_debt <- readr::read_csv(masterdata_debt_path, na = "", show_col_types = FALSE)
masterdata_ownership <- readr::read_csv(masterdata_ownership_path, na = "", show_col_types = FALSE)

# -------------------------------------------------------------------------

equity_ownership_data <- 
  ar_advanced_company_indicators %>% 
  filter(consolidation_method == "Equity Ownership") %>% 
  filter(value_type == "production") %>%
  filter(
    asset_sector == "Aviation" & activity_unit %in% c("pkm", "tkm") |
      asset_sector == "Cement" & activity_unit == "t cement" |
      asset_sector == "Coal" & activity_unit == "t coal" |
      asset_sector == "HDV" & activity_unit == "# vehicles" |
      asset_sector == "LDV" & activity_unit == "# vehicles" |
      asset_sector == "Oil&Gas" & activity_unit == "GJ" |
      asset_sector == "Power" & activity_unit == "MW" |
      asset_sector == "Shipping" & activity_unit == "dwt km" |
      asset_sector == "Steel" & activity_unit == "t steel"
  ) %>% 
  pivot_wider(names_from = "year", values_fill = 0) %>% 
  group_by(asset_sector) %>% 
  summarise(new_2022 = sum(`2022`, na.rm = TRUE), new_n = length(unique(company_id)))

masterdata_ownership %>% 
  filter(technology != "Grinding") %>%
  group_by(sector) %>% 
  summarise(old_2022 = sum(`_2022`, na.rm = TRUE), old_n = length(unique(company_id))) %>% 
  full_join(equity_ownership_data, by = c(sector = "asset_sector")) %>% 
  mutate(
    new_2022 = if_else(is.na(new_2022), as.numeric(0), as.numeric(new_2022)),
    new_n = if_else(is.na(new_n), as.numeric(0), as.numeric(new_n))
  ) %>% 
  mutate(diff = old_2022 - new_2022) %>% 
  mutate(percent_diff = round(diff / old_2022 * 100, digits = 2))

#> # A tibble: 9 × 7
#>   sector   old_2022 old_n new_2022 new_n     diff percent_diff
#>   <chr>       <dbl> <int>    <dbl> <dbl>    <dbl>        <dbl>
#> 1 Aviation  1.00e13  1820  1.00e13  1820  1.17e-2            0
#> 2 Cement    4.67e 9  2035  4.67e 9  2035 -6.73e-4            0
#> 3 Coal      1.15e10  2051  1.15e10  2061 -1.07e-2            0
#> 4 HDV       9.22e 6   575  9.22e 6   575 -6.36e-2            0
#> 5 LDV       1.26e 8   394  1.26e 8   435 -3.52e-1            0
#> 6 Oil&Gas   7.82e11  4568  7.82e11  4563  2.14e-2            0
#> 7 Power     1.60e 7 36917  1.60e 7 36875 -4.49e-1            0
#> 8 Shipping  2.69e14 11167  2.69e14 11169  3.06e+9            0
#> 9 Steel     2.75e 9  1105  2.75e 9  1105 -5.96e-3            0

# -------------------------------------------------------------------------

direct_ownership_data <- 
  ar_advanced_company_indicators %>% 
  filter(consolidation_method == "Direct Ownership") %>% 
  filter(value_type == "production") %>%
  filter(
    asset_sector == "Aviation" & activity_unit %in% c("pkm", "tkm") |
      asset_sector == "Cement" & activity_unit == "t cement" |
      asset_sector == "Coal" & activity_unit == "t coal" |
      asset_sector == "HDV" & activity_unit == "# vehicles" |
      asset_sector == "LDV" & activity_unit == "# vehicles" |
      asset_sector == "Oil&Gas" & activity_unit == "GJ" |
      asset_sector == "Power" & activity_unit == "MW" |
      asset_sector == "Shipping" & activity_unit == "dwt km" |
      asset_sector == "Steel" & activity_unit == "t steel"
  ) %>% 
  pivot_wider(names_from = "year", values_fill = 0) %>% 
  group_by(asset_sector) %>% 
  summarise(new_2022 = sum(`2022`, na.rm = TRUE), new_n = length(unique(company_id)))

masterdata_debt %>% 
  filter(technology != "Grinding") %>%
  group_by(sector) %>% 
  summarise(old_2022 = sum(`_2022`, na.rm = TRUE), old_n = length(unique(company_id))) %>% 
  full_join(direct_ownership_data, by = c(sector = "asset_sector")) %>% 
  mutate(
    new_2022 = if_else(is.na(new_2022), as.numeric(0), as.numeric(new_2022)),
    new_n = if_else(is.na(new_n), as.numeric(0), as.numeric(new_n))
  ) %>% 
  mutate(diff = old_2022 - new_2022) %>% 
  mutate(percent_diff = round(diff / old_2022 * 100, digits = 2))

#> # A tibble: 9 × 7
#>   sector   old_2022 old_n new_2022 new_n     diff percent_diff
#>   <chr>       <dbl> <int>    <dbl> <dbl>    <dbl>        <dbl>
#> 1 Aviation  5.35e12  1302  5.35e12  1302 -0.00684            0
#> 2 Cement    2.74e 9  1448  2.74e 9  1448  0                  0
#> 3 Coal      6.62e 9  1349  6.62e 9  1359  0                  0
#> 4 HDV       3.63e 6   283  3.63e 6   283  0                  0
#> 5 LDV       8.56e 7   194  8.56e 7   228  0                  0
#> 6 Oil&Gas   3.51e11  2869  3.51e11  2864  0.0242             0
#> 7 Power     7.59e 6 29641  7.59e 6 29611 -0.122              0
#> 8 Shipping  1.79e14  9844  1.79e14  9846 -0.0625             0
#> 9 Steel     1.69e 9   757  1.69e 9   757 -0.00299            0
jdhoffa commented 1 year ago

Blocked: Until AI adds a new consolidation method to their advanced company indicators, which likely will happen in Q3 this year Consolidation method is likely to be called "Credit Risk Parent" consolidation method, or something similar.