HughParsonage / grattan

Common quantitative tasks for Australian policy analysts
25 stars 8 forks source link

Update proportion of Australians paying any income tax #170

Closed HughParsonage closed 5 years ago

HughParsonage commented 5 years ago

Update:

The result has been a decline in the proportion of older Australians paying any income tax from 27 per cent in 1995 to 16 per cent in 2014.

for age groups

HughParsonage commented 5 years ago
Age_range number_of_taxpayers persons %_taxpayers %_taxpayers_at_least_this_old
<18 14,542 5,492,297 0.3% 43.4%
18 - 24 992,674 1,860,733 53.3% 56.6%
25 - 29 1,202,706 1,664,620 72.3% 57.0%
30 - 34 1,244,144 1,703,855 73.0% 55.2%
35 - 39 1,119,227 1,561,686 71.7% 52.8%
40 - 44 1,137,549 1,583,259 71.8% 50.2%
45 - 49 1,128,651 1,581,447 71.4% 46.6%
50 - 54 1,073,431 1,523,542 70.5% 41.6%
55 - 59 952,360 1,454,322 65.5% 34.8%
60 - 64 661,796 1,299,411 50.9% 25.8%
65 - 69 312,680 1,189,001 26.3% 17.0%
70 - 74 126,700 887,707 14.3% 12.5%
75+ 184,107 1,600,053 11.5% 11.5%
library(readxl)
library(magrittr)
library(hutils)
library(data.table)
library(knitr)

URL_Table2 <-
  "https://data.gov.au/dataset/d170213c-4391-4d10-ac24-b0c11768da3f/resource/ca3d7752-e689-47fa-a746-20cfee9f2630/download/taxstats2016individual02lodgmentmethodgendertaxablestatusstateage.xlsx"

destfile <- "data-raw/Table2.xlsx"

if (!file.exists("DESCRIPTION") || !dir.exists(basename(destfile))) {
  message("Using non-interactive mode => xlsx file will not be cached")
  destfile <- tempfile(fileext = ".xlsx")
}

res <-
  download.file(URL_Table2,
                destfile = destfile,
                mode = "wb")
if (res) {
  if (HTTP_ERROR <- httr::http_error(URL_Table2)) {
    stop(HTTP_ERROR)
  } else {
    stop("Unable to download \n\t",
         URL_Table2)
  }
}

individuals_table2_201516_raw <-
  read_excel(destfile,
             sheet = "Individuals Table 2A",
             skip = 2) %>%
  as.data.table

individuals_table2_201516 <-
  individuals_table2_201516_raw %>%
  melt.data.table(id.vars = c("Lodgment method",
                              "Gender",
                              "Taxable status",
                              "State / Territory1",
                              "Age range2"),
                  variable.factor = FALSE,
                  variable.name = "raw_variable") %>%
  .[, c("variable", "suffix") := tstrsplit(raw_variable, split = "\r\n")] %>%
  .[, c("variable", "suffix") := lapply(.SD, trimws), .SDcols = c("variable", "suffix")] %>%
  setnames(c("Lodgment method",
             "Gender",
             "Taxable status",
             "State / Territory1",
             "Age range2"),
           c("Lodgment_method",
             "Gender",
             "TaxableStatus",
             "State",
             "Age_range")) %>%
  .[]

number_of_taxpayers <-
  individuals_table2_201516 %>%
  .[variable %ein% "Number of individuals" & TaxableStatus %ein% "Taxable"] %>%
  .[suffix %ein% "no."] %>%
  .[, .(number_of_taxpayers = sum(value)), keyby = "Age_range"] %>%
  .[, Age_range := sub("^[^0-9]+", "", Age_range)] %>%
  .[, c("min_age", "max_age") := tstrsplit(sub("^18$",
                                               "0 - 18",
                                               sub(" and over",
                                                   " - 100",
                                                   Age_range)),
                                           split = " - ",
                                           fixed = TRUE)] %>%
  .[]

persons_by_age <-
  Census2016.DataPack::STE__Age.min %>%
  .[, .(persons = sum(persons)),
    keyby = .(Age_range = cut(Age.min,
                              breaks = c(-Inf, number_of_taxpayers[, unique(max_age)]),
                              labels = number_of_taxpayers[, unique(Age_range)]))] %>%
  .[]

format_numbers <- function(x) {
  if (!is.numeric(x)) {
    return(x)
  }
  if (min(x) > -1 && max(x) < 1) {
    rx <- round(x * 100, 1)

    return(paste0(formatC(rx, width = 3, flag = "#", digits = 1, format = "f",
                          drop0trailing = FALSE),
                  "%"))
  }
  if (max(abs(x)) > 1e3) {
    return(prettyNum(round(x), big.mark = ","))
  }
  x
}
stopifnot(identical(format_numbers(c(0.17, 0.115)),
                    c("17.0%", "11.5%")))

number_of_taxpayers[persons_by_age, on = "Age_range"] %>%
  .[, "%_taxpayers" := number_of_taxpayers / persons] %>%
  .[, "%_taxpayers_at_least_this_old" := rev(cumsum(rev(number_of_taxpayers)) / cumsum(rev(persons)))] %>%
  .[] %>%
  .[, c("min_age", "max_age") := NULL] %>%
  .[Age_range %ein% "18", Age_range := "<18"] %>%
  .[Age_range %ein% "75 and over", Age_range := "75+"] %T>%
  .[, stopifnot(sum(number_of_taxpayers) %between% c(10e6, 11e6),
                sum(persons) %between% c(23e6, 24e6))] %>%
  .[, lapply(.SD, format_numbers)] %>%
  kable(align = "rrrrr")
HughParsonage commented 5 years ago

Proportion of each age group paying income tax

fy_year Age nTaxpayers Population P
2003-04 under 20 618,400 5,342,113 11.6%
2003-04 20 to 24 968,800 1,380,726 70.2%
2003-04 25 to 29 1,009,200 1,343,718 75.1%
2003-04 30 to 34 1,080,100 1,509,100 71.6%
2003-04 35 to 39 1,045,200 1,451,389 72.0%
2003-04 40 to 44 1,097,700 1,530,232 71.7%
2003-04 45 to 49 1,020,100 1,424,317 71.6%
2003-04 50 to 54 882,100 1,310,016 67.3%
2003-04 55 to 59 697,400 1,186,934 58.8%
2003-04 60 to 64 404,500 895,320 45.2%
2003-04 65 to 69 137,700 732,674 18.8%
2003-04 70 and over 246,900 1,826,183 13.5%
2004-05 under 20 643,100 5,362,923 12.0%
2004-05 20 to 24 1,043,700 1,414,648 73.8%
2004-05 25 to 29 997,800 1,352,553 73.8%
2004-05 30 to 34 1,080,400 1,500,518 72.0%
2004-05 35 to 39 1,054,900 1,468,398 71.8%
2004-05 40 to 44 1,122,700 1,527,646 73.5%
2004-05 45 to 49 1,032,700 1,451,092 71.2%
2004-05 50 to 54 925,400 1,325,456 69.8%
2004-05 55 to 59 742,400 1,226,445 60.5%
2004-05 60 to 64 448,600 935,286 48.0%
2004-05 65 to 69 154,000 754,865 20.4%
2004-05 70 and over 271,900 1,857,014 14.6%
2005-06 under 20 666,200 5,395,069 12.3%
2005-06 20 to 24 1,057,200 1,448,435 73.0%
2005-06 25 to 29 1,009,500 1,381,588 73.1%
2005-06 30 to 34 1,103,000 1,474,144 74.8%
2005-06 35 to 39 1,047,400 1,508,828 69.4%
2005-06 40 to 44 1,133,200 1,516,444 74.7%
2005-06 45 to 49 1,061,200 1,477,732 71.8%
2005-06 50 to 54 929,600 1,347,835 69.0%
2005-06 55 to 59 786,300 1,257,992 62.5%
2005-06 60 to 64 478,400 978,835 48.9%
2005-06 65 to 69 169,900 773,121 22.0%
2005-06 70 and over 294,900 1,890,943 15.6%
2006-07 under 20 589,200 5,467,582 10.8%
2006-07 20 to 24 1,067,900 1,483,141 72.0%
2006-07 25 to 29 1,026,000 1,431,018 71.7%
2006-07 30 to 34 1,073,100 1,457,084 73.6%
2006-07 35 to 39 1,073,300 1,555,685 69.0%
2006-07 40 to 44 1,119,800 1,504,232 74.4%
2006-07 45 to 49 1,060,800 1,509,850 70.3%
2006-07 50 to 54 925,100 1,373,058 67.4%
2006-07 55 to 59 809,900 1,254,193 64.6%
2006-07 60 to 64 495,300 1,055,169 46.9%
2006-07 65 to 69 158,500 800,638 19.8%
2006-07 70 and over 270,700 1,935,972 14.0%
2007-08 under 20 613,200 5,548,508 11.1%
2007-08 20 to 24 1,150,400 1,526,362 75.4%
2007-08 25 to 29 1,104,400 1,500,008 73.6%
2007-08 30 to 34 1,135,200 1,458,344 77.8%
2007-08 35 to 39 1,159,300 1,589,553 72.9%
2007-08 40 to 44 1,170,000 1,499,409 78.0%
2007-08 45 to 49 1,140,300 1,537,823 74.2%
2007-08 50 to 54 997,000 1,397,815 71.3%
2007-08 55 to 59 820,100 1,268,595 64.6%
2007-08 60 to 64 494,800 1,117,615 44.3%
2007-08 65 to 69 136,600 827,160 16.5%
2007-08 70 and over 244,700 1,978,007 12.4%
2008-09 under 20 237,500 5,620,478 4.2%
2008-09 20 to 24 967,700 1,581,376 61.2%
2008-09 25 to 29 1,110,000 1,577,309 70.4%
2008-09 30 to 34 1,026,900 1,476,377 69.6%
2008-09 35 to 39 1,105,300 1,604,280 68.9%
2008-09 40 to 44 1,069,900 1,512,063 70.8%
2008-09 45 to 49 1,125,300 1,554,430 72.4%
2008-09 50 to 54 1,004,700 1,430,082 70.3%
2008-09 55 to 59 836,500 1,287,172 65.0%
2008-09 60 to 64 570,600 1,157,520 49.3%
2008-09 65 to 69 151,500 865,863 17.5%
2008-09 70 and over 228,200 2,024,703 11.3%
2009-10 under 20 197,800 5,658,746 3.5%
2009-10 20 to 24 914,000 1,605,054 56.9%
2009-10 25 to 29 1,116,800 1,627,256 68.6%
2009-10 30 to 34 1,042,800 1,498,197 69.6%
2009-10 35 to 39 1,105,100 1,600,546 69.0%
2009-10 40 to 44 1,072,500 1,537,102 69.8%
2009-10 45 to 49 1,095,200 1,554,804 70.4%
2009-10 50 to 54 1,006,800 1,460,583 68.9%
2009-10 55 to 59 845,500 1,308,403 64.6%
2009-10 60 to 64 592,100 1,194,384 49.6%
2009-10 65 to 69 171,600 908,395 18.9%
2009-10 70 and over 211,600 2,078,280 10.2%
2010-11 under 20 74,500 5,687,072 1.3%
2010-11 20 to 24 828,900 1,611,663 51.4%
2010-11 25 to 29 1,134,100 1,658,170 68.4%
2010-11 30 to 34 1,091,500 1,536,161 71.1%
2010-11 35 to 39 1,044,300 1,573,910 66.4%
2010-11 40 to 44 1,129,400 1,587,244 71.2%
2010-11 45 to 49 1,093,500 1,541,837 70.9%
2010-11 50 to 54 1,064,900 1,494,063 71.3%
2010-11 55 to 59 913,300 1,335,993 68.4%
2010-11 60 to 64 675,600 1,226,000 55.1%
2010-11 65 to 69 235,000 954,260 24.6%
2010-11 70 and over 265,600 2,133,651 12.4%
2011-12 under 20 193,400 5,764,298 3.4%
2011-12 20 to 24 946,650 1,628,778 58.1%
2011-12 25 to 29 1,208,700 1,698,768 71.2%
2011-12 30 to 34 1,130,650 1,589,894 71.1%
2011-12 35 to 39 1,093,000 1,557,083 70.2%
2011-12 40 to 44 1,150,000 1,634,211 70.4%
2011-12 45 to 49 1,104,000 1,533,749 72.0%
2011-12 50 to 54 1,088,150 1,523,432 71.4%
2011-12 55 to 59 912,850 1,363,695 66.9%
2011-12 60 to 64 652,200 1,225,704 53.2%
2011-12 65 to 69 304,200 1,022,037 29.8%
2011-12 70 and over 388,900 2,191,816 17.7%
2012-13 under 20 126,150 5,844,248 2.2%
2012-13 20 to 24 839,050 1,645,662 51.0%
2012-13 25 to 29 1,154,750 1,732,064 66.7%
2012-13 30 to 34 1,116,750 1,652,555 67.6%
2012-13 35 to 39 1,055,200 1,552,804 68.0%
2012-13 40 to 44 1,127,050 1,659,367 67.9%
2012-13 45 to 49 1,050,550 1,531,499 68.6%
2012-13 50 to 54 1,051,850 1,547,308 68.0%
2012-13 55 to 59 892,000 1,389,209 64.2%
2012-13 60 to 64 611,100 1,243,035 49.2%
2012-13 65 to 69 231,250 1,079,676 21.4%
2012-13 70 and over 250,200 2,250,702 11.1%
2013-14 under 20 126,850 5,910,431 2.1%
2013-14 20 to 24 834,850 1,661,633 50.2%
2013-14 25 to 29 1,163,500 1,756,185 66.3%
2013-14 30 to 34 1,179,250 1,704,549 69.2%
2013-14 35 to 39 1,068,700 1,555,049 68.7%
2013-14 40 to 44 1,149,750 1,663,547 69.1%
2013-14 45 to 49 1,064,400 1,539,898 69.1%
2013-14 50 to 54 1,064,100 1,561,634 68.1%
2013-14 55 to 59 905,800 1,417,422 63.9%
2013-14 60 to 64 634,900 1,263,190 50.3%
2013-14 65 to 69 250,850 1,118,120 22.4%
2013-14 70 and over 277,400 2,324,028 11.9%

(Wide format)

Age 2003-04 2007-08 2012-13
under 20 11.6% 11.1% 2.2%
20 to 24 70.2% 75.4% 51.0%
25 to 29 75.1% 73.6% 66.7%
30 to 34 71.6% 77.8% 67.6%
35 to 39 72.0% 72.9% 68.0%
40 to 44 71.7% 78.0% 67.9%
45 to 49 71.6% 74.2% 68.6%
50 to 54 67.3% 71.3% 68.0%
55 to 59 58.8% 64.6% 64.2%
60 to 64 45.2% 44.3% 49.2%
65 to 69 18.8% 16.5% 21.4%
70 and over 13.5% 12.4% 11.1%
library(data.table)
library(knitr)
library(readxl)
library(magrittr)
library(ggplot2)
library(taxstats)
library(taxstats1516)

# Table 2 individuals has age and taxable status
individuals_table1_1516.tsv <- "~/taxstats/data-raw/2015-16/individuals_table2_201516.tsv"

if (!file.exists(individuals_table1_1516.tsv)) {
  individuals_table1_1516.tsv <- tempfile(fileext = ".tsv")
  res <- 
    download.file("https://github.com/HughParsonage/taxstats/raw/master/data-raw/2015-16/individuals_table1_201516.tsv",
                  destfile = individuals_table1_1516.tsv,
                  method = "wb")
  if (res) {
    if (HTTP_ERROR <- httr::http_error("https://github.com/HughParsonage/taxstats/raw/master/data-raw/2015-16/individuals_table1_201516.tsv")) {
      stop(HTTP_ERROR)
    } else {
      stop("Unable to download file.")
    }
  }
}

individuals_table1_1516 <-
  fread(file = individuals_table1_1516.tsv, sep = "\t")

individuals_table1_1516 %>% 
  .[startsWith(Selected_items, "18")]

age2age_range <- function(age) {
  if_else(age >= 70, 
          0, 
          pmin(11 - {(age - 15) %/% 5}, 11))
}
stopifnot(age2age_range(65) == 1)
stopifnot(age2age_range(64) == 2)
stopifnot(age2age_range(60) == 2)
stopifnot(age2age_range(24) == 10)
stopifnot(age2age_range(20) == 10)
stopifnot(age2age_range(19) == 11)

aus_pop_by_fy_age_range <- 
  grattan:::aus_pop_fy_age(tbl = TRUE) %>%
  .[, .(Population = sum(Population),
        maxAge = max(Age)),
    keyby = .(fy_year, age_range = age2age_range(Age))]

sfa <- get_sample_files_all2()
sfa[, tax := income_tax(Taxable_Income, fy.year = .BY[[1L]], .dots.ATO = .SD),
    keyby = .(fy_year)]
nTaxpayers_by_fy_age_range <- 
  sfa[tax > 0][, .(nTaxpayers = sum(WEIGHT)), keyby = c("fy_year", "age_range")]

format_numbers <- function(x) {
  if (!is.numeric(x)) {
    return(x)
  }
  if (min(x) > -1 && max(x) < 1) {
    rx <- round(x * 100, 1)

    return(paste0(formatC(rx, width = 3, flag = "#", digits = 1, format = "f",
                          drop0trailing = FALSE),
                  "%"))
  }
  if (max(abs(x)) > 1e3) {
    return(prettyNum(round(x), big.mark = ","))
  }
  x
}

nTaxpayers_by_fy_age_range[aus_pop_by_fy_age_range,
                           on = c("fy_year", "age_range"),
                           nomatch = 0L] %>%
  .[, P := nTaxpayers / Population] %>%
  .[age_range_decoder, on = "age_range"] %>%
  setnames("age_range_description", "Age") %>%
  .[, age_range := NULL] %>%
  .[, maxAge := NULL] %>%
  setkey(fy_year, Age) %>%
  .[, nTaxpayers := prettyNum(nTaxpayers, big.mark = ",")] %>%
  .[, Population := prettyNum(Population, big.mark = ",")] %>%
  .[, lapply(.SD, format_numbers)] %>%
  set_cols_first(key(.)) %>%
  kable(align = "r")

nTaxpayers_by_fy_age_range[aus_pop_by_fy_age_range,
                           on = c("fy_year", "age_range"),
                           nomatch = 0L] %>%
  .[, P := nTaxpayers / Population] %>%
  .[age_range_decoder, on = "age_range"] %>%
  setnames("age_range_description", "Age") %>%
  .[, age_range := NULL] %>%
  .[, maxAge := NULL] %>%
  setkey(fy_year, Age) %>%
  set_cols_first(key(.)) %>%
  .[, lapply(.SD, format_numbers)] %>%
  .[fy_year %in% yr2fy(c(2004, 2008, 2013)),
    .(Age, fy_year, P)] %>%
  dcast(Age ~ fy_year, value.var = "P") %>%
  kable(align = "r")
HughParsonage commented 5 years ago

image

nTaxpayers_by_fy_age_range[aus_pop_by_fy_age_range,
                           on = c("fy_year", "age_range"),
                           nomatch = 0L] %>%
  .[, P := nTaxpayers / Population] %>%
  .[, Year := fy2yr(fy_year)] %>%
  .[age_range_decoder, on = "age_range"] %>%
  ggplot(aes(x = Year, 
             y = P,
             group = age_range_description, 
             color = age_range_description)) + 
  geom_line(size = 1.4) + 
  theme_dark() + 
  scale_y_continuous(labels = format_numbers)