── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.0 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ purrr::%@%() masks rlang::%@%()
✖ dplyr::filter() masks stats::filter()
✖ purrr::flatten() masks rlang::flatten()
✖ purrr::flatten_chr() masks rlang::flatten_chr()
✖ purrr::flatten_dbl() masks rlang::flatten_dbl()
✖ purrr::flatten_int() masks rlang::flatten_int()
✖ purrr::flatten_lgl() masks rlang::flatten_lgl()
✖ purrr::flatten_raw() masks rlang::flatten_raw()
✖ purrr::invoke() masks rlang::invoke()
✖ dplyr::lag() masks stats::lag()
✖ purrr::splice() masks rlang::splice()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Attaching package: 'vroom'
The following objects are masked from 'package:readr':
as.col_spec, col_character, col_date, col_datetime, col_double,
col_factor, col_guess, col_integer, col_logical, col_number,
col_skip, col_time, cols, cols_condense, cols_only, date_names,
date_names_lang, date_names_langs, default_locale, fwf_cols,
fwf_empty, fwf_positions, fwf_widths, locale, output_column,
problems, spec
Attaching package: 'arrow'
The following object is masked from 'package:lubridate':
duration
The following object is masked from 'package:rlang':
string
The following object is masked from 'package:utils':
timestamp
Attaching package: 'jsonlite'
The following object is masked from 'package:purrr':
flatten
The following objects are masked from 'package:rlang':
flatten, unbox
Attaching package: 'tidyjson'
The following object is masked from 'package:jsonlite':
read_json
The following object is masked from 'package:stats':
filter
Attaching package: 'scales'
The following object is masked from 'package:vroom':
col_factor
The following object is masked from 'package:purrr':
discard
The following object is masked from 'package:readr':
col_factor
Attaching package: 'gridExtra'
The following object is masked from 'package:dplyr':
combine
ggbreak v0.1.2
If you use ggbreak in published research, please cite the following
paper:
S Xu, M Chen, T Feng, L Zhan, L Zhou, G Yu. Use ggbreak to effectively
utilize plotting space to deal with large datasets and outliers.
Frontiers in Genetics. 2021, 12:774846. doi: 10.3389/fgene.2021.774846
Attaching package: 'kableExtra'
The following object is masked from 'package:dplyr':
group_rows
Attaching package: 'janitor'
The following objects are masked from 'package:stats':
chisq.test, fisher.test
Attaching package: 'maps'
The following object is masked from 'package:purrr':
map
2.4 Download the IRS spreadsheets (if the code chunk’s eval option is set to TRUE)
Code
# CAUTION: only re-download spreadsheets (set eval: true) if they have changed or been deletedtabmetaf <-function(upath){print(upath)download.file(url=upath, destfile=path(irsd, path_file(upath)), mode="wb")}walk(tabmeta$upath, f) # walk through the list of paths, downloading and saving each file
2.5 Functions to read IRS spreadsheets
2.5.1 Function to read mapping tab for an IRS spreadsheet (found in target_recipes.xlsx)
Code
# tab <- "tab11"get_rowmap <-function(tab){# reads the target_recipes.xlsx file to# get start and end row for key data for each year of a particular IRS spreadsheet# from its associated mapping tab in the recipes file# assumes dd (data directory) and targfn (targets filename) are in the environment sheet <-paste0(tab, "_map")read_excel(path(dd, targfn), sheet=sheet, range=cellranger::cell_rows(1:3)) |>pivot_longer(-rowtype, values_to ="xlrownum") |>separate_wider_delim(name, delim="_", names=c("datatype", "year")) |>mutate(table=tab,year=as.integer(year),xlrownum=as.integer(xlrownum)) |>select(table, datatype, year, rowtype, xlrownum) |>arrange(table, year, datatype, desc(rowtype))}xlcols <-function(n) {# create a vector of letters in the order that Excel uses# a helper function that allows us to put letter column names on a dataframe# that was read from an Excel file# usage: # xlcols(53) # gets the letters for the first 53 columns in a spreadsheet# only good for 1- and 2-letter columns, or 26 + 26 x 26 = 702 columns xl_letters <-c(LETTERS, sapply(LETTERS, function(x) paste0(x, LETTERS, sep ="")))return(xl_letters[1:n])}get_colmap <-function(tab){# reads the target_recipes.xlsx file to# get columns of interest for each year of a particular IRS spreadsheet,# from its associated mapping tab in the recipes file# assumes dd (data directory), targfn (targets filename), and allcols are in the environment sheet <-paste0(tab, "_map") col_map <-read_excel(path(dd, targfn), sheet=sheet, skip=3) |>pivot_longer(-c(vname, description, units, notes), values_to ="xlcolumn") |>separate_wider_delim(name, delim="_", names=c("datatype", "year")) |>mutate(table=tab,year=as.integer(year),xl_colnumber=match(xlcolumn, allcols)) |>select(table, datatype, year, xl_colnumber, xlcolumn, vname, description, units, notes) |>filter(!is.na(xlcolumn), !is.na(vname)) |>arrange(table, datatype, year, xl_colnumber) col_map}# get_colmap("tab11")
2.6 Read IRS spreadsheets and save data frame of targets as csv file
Code
allcols <-xlcols(400) # get a large number of potential excel column namesht(allcols)get_rowmap("tab11")get_rowmap("tab21")# tabs <- c("tab11", "tab12")# tabs <- c("tab11", "tab12", "tab14")tabs <-c("tab11", "tab12", "tab14", "tab21")# get start and end rows for each file of interesttabrows <- tabs |> purrr::map(get_rowmap) |>list_rbind() |>pivot_wider(names_from = rowtype, values_from = xlrownum)tabrowstabcols <- tabs |> purrr::map(get_colmap) |>list_rbind()tabcolstabcols_nested <- tabcols |>summarise(maxcol=max(xl_colnumber),column_letters=list(xlcolumn), vnames=list(vname),.by=c(table, datatype, year))tabcols_nested# tabcols_nested |># unnest(cols=column_letters)# define the tables to gettabget <- tabmeta |>filter(table %in% tabs) |>select(table, year, fname, table_description) |>left_join(tabrows, by =join_by(table, year)) |>left_join(tabcols_nested, by =join_by(table, datatype, year))# fname <- "15in11si.xls"# startrow <- 10; endrow <- 29; maxcol <- 7; column_letters <- list(c("A", "B", "D", "G")); colnames <- list(c("x1", "x2", "x3", "x4"))get_irsdata <-function(fname, startrow, endrow, maxcol, column_letters, vnames){# a single file, a single year, a single datatype (filers or taxable)# print(fname); print(datatype) fpath <-path(irsd, fname)# read relevant rows, but all columns from the first column through the last in columns# we cannot read just the desired columns, due to limitations of read_excel# assumes irsd and allcols exist df1 <-read_excel(fpath, sheet=1, range=cellranger::cell_limits(c(startrow, 1), c(endrow, maxcol)),col_names = allcols[1:maxcol], col_types ="text")# keep desired columns, substitute the passed-in column names for letters, and pivot df2 <- df1 |>select(all_of(unlist(column_letters))) |>setNames(unlist(vnames)) |>mutate(xlrownum=startrow:endrow, incsort=row_number()) |>pivot_longer(cols=-c(incsort, xlrownum, incrange), names_to ="vname", values_to ="ptarget") |>mutate(ptarget=as.numeric(ptarget)) |># every column had better be a number stored as text!relocate(incsort, xlrownum) df2}ptargets_nested <- tabget |># filter(row_number() == 4) |> mutate(targets=get_irsdata(fname, startrow, endrow, maxcol, column_letters, vnames) |>list(), .by=c(fname, datatype))ptargets_nestedptargets_nested |>unnest(col=c(targets))ptargets_nested |>filter(table=="tab21", datatype=="taxable") |>unnest(col=c(targets))ptargets <- ptargets_nested |>select(table, datatype, year, fname, targets, table_description) |>unnest(col=targets) |>left_join(tabcols |>select(table, datatype, year, xlcolumn, vname),by =join_by(table, datatype, year, vname)) |>relocate(xlcolumn, xlrownum, .after=fname) |>arrange(table, year, vname, incsort)saveRDS(ptargets, here::here("data", "potential_targets.rds"))ptargets |>write_csv(here::here("data", "potential_targets.csv"))