2  Get files

2.1 Load packages

Code
source(here::here("r", "libraries.r"))
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ purrr::%@%()         masks rlang::%@%()
✖ dplyr::filter()      masks stats::filter()
✖ purrr::flatten()     masks rlang::flatten()
✖ purrr::flatten_chr() masks rlang::flatten_chr()
✖ purrr::flatten_dbl() masks rlang::flatten_dbl()
✖ purrr::flatten_int() masks rlang::flatten_int()
✖ purrr::flatten_lgl() masks rlang::flatten_lgl()
✖ purrr::flatten_raw() masks rlang::flatten_raw()
✖ purrr::invoke()      masks rlang::invoke()
✖ dplyr::lag()         masks stats::lag()
✖ purrr::splice()      masks rlang::splice()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: 'vroom'


The following objects are masked from 'package:readr':

    as.col_spec, col_character, col_date, col_datetime, col_double,
    col_factor, col_guess, col_integer, col_logical, col_number,
    col_skip, col_time, cols, cols_condense, cols_only, date_names,
    date_names_lang, date_names_langs, default_locale, fwf_cols,
    fwf_empty, fwf_positions, fwf_widths, locale, output_column,
    problems, spec



Attaching package: 'arrow'


The following object is masked from 'package:lubridate':

    duration


The following object is masked from 'package:rlang':

    string


The following object is masked from 'package:utils':

    timestamp



Attaching package: 'jsonlite'


The following object is masked from 'package:purrr':

    flatten


The following objects are masked from 'package:rlang':

    flatten, unbox



Attaching package: 'tidyjson'


The following object is masked from 'package:jsonlite':

    read_json


The following object is masked from 'package:stats':

    filter



Attaching package: 'scales'


The following object is masked from 'package:vroom':

    col_factor


The following object is masked from 'package:purrr':

    discard


The following object is masked from 'package:readr':

    col_factor



Attaching package: 'gridExtra'


The following object is masked from 'package:dplyr':

    combine


ggbreak v0.1.2

If you use ggbreak in published research, please cite the following
paper:

S Xu, M Chen, T Feng, L Zhan, L Zhou, G Yu. Use ggbreak to effectively
utilize plotting space to deal with large datasets and outliers.
Frontiers in Genetics. 2021, 12:774846. doi: 10.3389/fgene.2021.774846


Attaching package: 'kableExtra'


The following object is masked from 'package:dplyr':

    group_rows



Attaching package: 'janitor'


The following objects are masked from 'package:stats':

    chisq.test, fisher.test



Attaching package: 'maps'


The following object is masked from 'package:purrr':

    map
Code
source(here::here("r", "functions.r"))

# E:\R_projects\projects\taxdata-psl\data\IRS spreadsheets

2.2 Define folder names and other constants

Code
dd <- here::here("data")

irsweb <- "https://www.irs.gov/pub/irs-soi/"  # static files
irsd <- path(dd, "IRS spreadsheets")
targfn <- "target_recipes.xlsx"

2.3 Read the target_recipes tab that defines IRS table spreadsheets to download

Code
df1 <- read_excel(path(dd, targfn), sheet="irs_downloads")

tabmeta <- expand_grid(year=c(2015, 2021) |> as.integer(), df1) |> 
  mutate(fname=paste0(str_sub(year, 3, 4), fname_base),
         upath=paste0(baseurl, fname))

glimpse(tabmeta)
tabmeta |> 
  select(1:3)

2.4 Download the IRS spreadsheets (if the code chunk’s eval option is set to TRUE)

Code
# CAUTION: only re-download spreadsheets (set eval: true) if they have changed or been deleted

tabmeta

f <- function(upath){
  print(upath)
  download.file(url=upath, destfile=path(irsd, path_file(upath)), mode="wb")
}

walk(tabmeta$upath, f) # walk through the list of paths, downloading and saving each file

2.5 Functions to read IRS spreadsheets

2.5.1 Function to read mapping tab for an IRS spreadsheet (found in target_recipes.xlsx)

Code
# tab <- "tab11"

get_rowmap <- function(tab){
  # reads the target_recipes.xlsx file to
  # get start and end row for key data for each year of a particular IRS spreadsheet
  # from its associated mapping tab in the recipes file
  # assumes dd (data directory) and targfn (targets filename) are in the environment
  sheet <- paste0(tab, "_map")
  read_excel(path(dd, targfn), sheet=sheet, range=cellranger::cell_rows(1:3)) |> 
    pivot_longer(-rowtype, values_to = "xlrownum") |> 
    separate_wider_delim(name, delim="_", names=c("datatype", "year")) |> 
    mutate(table=tab,
           year=as.integer(year),
           xlrownum=as.integer(xlrownum)) |> 
    select(table, datatype, year, rowtype, xlrownum) |> 
    arrange(table, year, datatype, desc(rowtype))
}

xlcols <- function(n) {
  # create a vector of letters in the order that Excel uses
  # a helper function that allows us to put letter column names on a dataframe
  #   that was read from an Excel file
  # usage: 
  #   xlcols(53) 
  #   gets the letters for the first 53 columns in a spreadsheet
  # only good for 1- and 2-letter columns, or 26 + 26 x 26 = 702 columns
  xl_letters <- c(LETTERS, sapply(LETTERS, function(x) paste0(x, LETTERS, sep = "")))
  return(xl_letters[1:n])
}

get_colmap <- function(tab){
  # reads the target_recipes.xlsx file to
  # get columns of interest for each year of a particular IRS spreadsheet,
  # from its associated mapping tab in the recipes file
  
  # assumes dd (data directory), targfn (targets filename), and allcols are in the environment
  sheet <- paste0(tab, "_map")
  col_map <- read_excel(path(dd, targfn), sheet=sheet, skip=3) |> 
    pivot_longer(-c(vname, description, units, notes), values_to = "xlcolumn") |> 
    separate_wider_delim(name, delim="_", names=c("datatype", "year")) |> 
    mutate(table=tab,
           year=as.integer(year),
           xl_colnumber=match(xlcolumn, allcols)) |> 
    select(table, datatype, year, xl_colnumber, xlcolumn, vname, description, units, notes) |> 
    filter(!is.na(xlcolumn), !is.na(vname)) |> 
    arrange(table, datatype, year, xl_colnumber)
  col_map
}

# get_colmap("tab11")

2.6 Read IRS spreadsheets and save data frame of targets as csv file

Code
allcols <- xlcols(400) # get a large number of potential excel column names
ht(allcols)

get_rowmap("tab11")
get_rowmap("tab21")


# tabs <- c("tab11", "tab12")
# tabs <- c("tab11", "tab12", "tab14")
tabs <- c("tab11", "tab12", "tab14", "tab21")

# get start and end rows for each file of interest
tabrows <- tabs |> 
  purrr::map(get_rowmap) |> 
  list_rbind() |> 
  pivot_wider(names_from = rowtype, values_from = xlrownum)
tabrows

tabcols <- tabs |> 
  purrr::map(get_colmap) |> 
  list_rbind()
tabcols

tabcols_nested <- tabcols |> 
  summarise(maxcol=max(xl_colnumber),
            column_letters=list(xlcolumn), 
            vnames=list(vname),
            .by=c(table, datatype, year))
tabcols_nested

# tabcols_nested |>
#   unnest(cols=column_letters)

# define the tables to get
tabget <- tabmeta |> 
  filter(table %in% tabs) |> 
  select(table, year, fname, table_description) |> 
  left_join(tabrows, by = join_by(table, year)) |> 
  left_join(tabcols_nested, by = join_by(table, datatype, year))

# fname <- "15in11si.xls"
# startrow <- 10; endrow <- 29; maxcol <- 7; column_letters <- list(c("A", "B", "D", "G")); colnames <- list(c("x1", "x2", "x3", "x4"))


get_irsdata <- function(fname, startrow, endrow, maxcol, column_letters, vnames){
  # a single file, a single year, a single datatype (filers or taxable)
  # print(fname); print(datatype)
  fpath <- path(irsd, fname)
  
  # read relevant rows, but all columns from the first column through the last in columns
  #   we cannot read just the desired columns, due to limitations of read_excel
  
  # assumes irsd and allcols exist
  
  df1 <- read_excel(fpath, sheet=1, 
                    range=cellranger::cell_limits(c(startrow, 1), 
                                                  c(endrow, maxcol)),
                    col_names = allcols[1:maxcol], col_types = "text")
  
  # keep desired columns, substitute the passed-in column names for letters, and pivot
  df2 <- df1 |> 
    select(all_of(unlist(column_letters))) |> 
    setNames(unlist(vnames)) |>
    mutate(xlrownum=startrow:endrow, incsort=row_number()) |> 
    pivot_longer(cols=-c(incsort, xlrownum, incrange), names_to = "vname", values_to = "ptarget") |> 
    mutate(ptarget=as.numeric(ptarget)) |>  # every column had better be a number stored as text!
    relocate(incsort, xlrownum)
  df2
}


ptargets_nested <- tabget |> 
  # filter(row_number() == 4) |> 
  mutate(targets=get_irsdata(fname, startrow, endrow, maxcol, column_letters, vnames) |>
           list(), .by=c(fname, datatype))
ptargets_nested
ptargets_nested |> unnest(col=c(targets))
ptargets_nested |> 
  filter(table=="tab21", datatype=="taxable") |> 
  unnest(col=c(targets))

ptargets <- ptargets_nested |> 
  select(table, datatype, year, fname, targets, table_description) |>
  unnest(col=targets) |> 
  left_join(tabcols |> select(table, datatype, year, xlcolumn, vname),
            by = join_by(table, datatype, year, vname)) |> 
  relocate(xlcolumn, xlrownum, .after=fname) |> 
  arrange(table, year, vname, incsort)

saveRDS(ptargets, here::here("data", "potential_targets.rds"))

ptargets |> 
  write_csv(here::here("data", "potential_targets.csv"))

2.7 Show potential targets

Code
ptargets <- readRDS(here::here("data", "potential_targets.rds"))

library(DT)
# datatable(ptargets, options = list(scrollX = TRUE, scrollY = '400px', paging = TRUE))

datatable(ptargets |> 
            select(-table_description),
          options = list(scrollX = TRUE, scrollY = '1500px', pageLength = 20),
          filter="top") |>
  formatCurrency(columns = c("ptarget"), currency="", digits=0)