Examples: SMR04 data

library(eider)
library(magrittr)

This series of vignettes in the Gallery section aims to demonstrate the functionality of eider through examples that are similar to real-life usage. To do this, we have created a series of randomly generated datasets that are stored with the package. You can access these datasets using the eider_example() function, which returns the path to where the dataset is stored in your installation of R.

smr04_data_filepath <- eider_example("random_smr04_data.csv")

smr04_data_filepath
#> [1] "/tmp/RtmpxmnFiA/Rinstb695308b1/eider/extdata/random_smr04_data.csv"

The data

In this specific vignette, we are using simulated SMR04 data. Our dataset does not contain every column in the full SMR04 specification, but serves as a useful example of how real-life data may be treated using eider.

smr04_data <- utils::read.csv(smr04_data_filepath) %>%
  dplyr::mutate(
    admission_date = lubridate::ymd(admission_date),
    discharge_date = lubridate::ymd(discharge_date)
  )

dplyr::glimpse(smr04_data)
#> Rows: 217
#> Columns: 7
#> $ id                 <int> 1, 3, 3, 3, 11, 11, 2, 19, 19, 19, 16, 16, 4, 4, 16…
#> $ admission_date     <date> 2015-07-15, 2016-05-03, 2016-05-04, 2016-05-05, 20…
#> $ discharge_date     <date> 2015-07-15, 2016-05-04, 2016-05-05, 2016-05-06, 20…
#> $ cis_marker         <int> 26, 20, 20, 20, 33, 33, 56, 70, 70, 70, 59, 59, 67,…
#> $ episode_within_cis <int> 1, 1, 2, 3, 1, 2, 1, 1, 2, 3, 1, 2, 1, 2, 1, 2, 3, …
#> $ admission_type     <int> 34, 19, NA, NA, 12, NA, 34, 40, NA, NA, 11, NA, 20,…
#> $ specialty          <chr> "G22", "G61", "G21", "G4", "G63", "G6", "G22", "G1"…

(Note that when the data is loaded by eider, the date columns are automatically converted to the date type for you: you do not need to do the manual processing above.)

Each row in this table corresponds to one episode; multiple episodes may be associated with the same stay. This simplified table has 7 columns:

id, which is a numeric patient ID;
admission_date and discharge_date, which are the dates of admission and discharge for each episode;
cis_marker, which is a number identifying each stay for a given patient (note that the same value may be shared by different patient IDs, so it is not necessarily unique across the whole table);
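episode_within_cis, which is the episode number within each stay;
admission_type, which is an integer code for the type of admission (this can be missing, as the NA values in the output above show); and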
specialty, which is the specialty code for the episode.

Feature 1: Number of episodes associated with a psychotherapy specialty

We begin with a simple example: counting the number of episodes (i.e. number of rows) associated with a psychotherapy specialty, which corresponds to any of the codes "G6", "G61", "G62", or "G63". This is a fairly straightforward use of the "count" transformation type, with a filter to select for those values. We can, in principle, express this filter as a compound filter with type "or", as the following shows:

{
  "filter": {
    "type": "or",
    "subfilter": {
      "g6": {
        "column": "specialty",
        "type": "in",
        "value": "G6"
      },
      "g61": {
        "column": "specialty",
        "type": "in",
        "value": "G61"
      },
      "g62": {
        "column": "specialty",
        "type": "in",
        "value": "G62"
      },
      "g63": {
        "column": "specialty",
        "type": "in",
        "value": "G63"
      }
    }
  }
}

However, for filters with type "in", eider lets the user specify multiple values to compare against, which is much more compact. The resulting feature definition is as follows:

pt_episodes_filepath <- eider_example("psychotherapy_episodes.json")
writeLines(readLines(pt_episodes_filepath))
#> {
#>   "source_table": "smr04",
#>   "transformation_type": "count",
#>   "grouping_column": "id",
#>   "absent_default_value": 0,
#>   "output_feature_name": "num_psychotherapy_episodes",
#>   "filter": {
#>     "column": "specialty",
#>     "type": "in",
#>     "value": [
#>       "G6",
#>       "G61",
#>       "G62",
#>       "G63"
#>     ]
#>   }
#> }

res <- run_pipeline(
  data_sources = list(smr04 = smr04_data_filepath),
  feature_filenames = pt_episodes_filepath
)

dplyr::glimpse(res$features)
#> Rows: 20
#> Columns: 2
#> $ id                         <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1…
#> $ num_psychotherapy_episodes <int> 3, 5, 2, 3, 5, 1, 1, 3, 0, 2, 1, 6, 4, 3, 9…

Feature 2: Number of stays associated with a psychotherapy specialty

Next, we will count the number of stays associated with a psychotherapy specialty. This is slightly more complicated, as each row corresponds to an episode, not a stay: thus, we cannot simply count the number of rows in the table. Instead, we will need to count the number of unique cis_marker values associated with a psychotherapy specialty, as each cis_marker corresponds to a different stay. This means a transformation type of "nunique", and an aggregation column of "cis_marker".

pt_stays_filepath <- eider_example("psychotherapy_stays.json")
writeLines(readLines(pt_stays_filepath))
#> {
#>   "source_table": "smr04",
#>   "transformation_type": "nunique",
#>   "grouping_column": "id",
#>   "absent_default_value": 0,
#>   "aggregation_column": "cis_marker",
#>   "output_feature_name": "num_psychotherapy_stays",
#>   "filter": {
#>     "column": "specialty",
#>     "type": "in",
#>     "value": [
#>       "G6",
#>       "G61",
#>       "G62",
#>       "G63"
#>     ]
#>   }
#> }

res <- run_pipeline(
  data_sources = list(smr04 = smr04_data_filepath),
  feature_filenames = pt_stays_filepath
)

dplyr::glimpse(res$features)
#> Rows: 20
#> Columns: 2
#> $ id                      <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, …
#> $ num_psychotherapy_stays <int> 2, 4, 2, 3, 4, 1, 1, 2, 0, 2, 1, 4, 3, 2, 6, 1…

Interlude: Adding data sources on-the-fly

The last two features will be concerned with the number of days spent in hospital. In principle, this can be easily calculated from the data we already have: by taking the discharge date and subtracting the admission date, we can obtain the length of each episode.

eider does not yet possess the functionality to preprocess tables in this way (by adding new columns). However, the package does allow you to perform this calculation yourself (e.g. using dplyr), and then add the new table to the pipeline as a new data source. Specifically, data sources do not necessarily need to be CSV filenames; they can simply be data frames themselves.

Let’s construct this new data frame:

smr04_with_days_data <- smr04_data %>%
  dplyr::mutate(days_in_hospital = as.numeric(discharge_date - admission_date))

dplyr::glimpse(smr04_with_days_data)
#> Rows: 217
#> Columns: 8
#> $ id                 <int> 1, 3, 3, 3, 11, 11, 2, 19, 19, 19, 16, 16, 4, 4, 16…
#> $ admission_date     <date> 2015-07-15, 2016-05-03, 2016-05-04, 2016-05-05, 20…
#> $ discharge_date     <date> 2015-07-15, 2016-05-04, 2016-05-05, 2016-05-06, 20…
#> $ cis_marker         <int> 26, 20, 20, 20, 33, 33, 56, 70, 70, 70, 59, 59, 67,…
#> $ episode_within_cis <int> 1, 1, 2, 3, 1, 2, 1, 1, 2, 3, 1, 2, 1, 2, 1, 2, 3, …
#> $ admission_type     <int> 34, 19, NA, NA, 12, NA, 34, 40, NA, NA, 11, NA, 20,…
#> $ specialty          <chr> "G22", "G61", "G21", "G4", "G63", "G6", "G22", "G1"…
#> $ days_in_hospital   <dbl> 0, 1, 1, 1, 2, 0, 0, 2, 2, 2, 0, 0, 4, 0, 2, 1, 0, …

In the subsequent sections, we’ll provide this new data frame as a data source to run_pipeline().

Feature 3: Total number of days spent in hospital

With this new column, we can now calculate the total number of days each patient has spent in hospital. This just requires a sum transformation, where we act on the column that we just added, called days_in_hospital.

total_days_filepath <- eider_example("days_in_smr04.json")
writeLines(readLines(total_days_filepath))
#> {
#>   "source_table": "smr04_with_days",
#>   "transformation_type": "sum",
#>   "grouping_column": "id",
#>   "absent_default_value": 0,
#>   "aggregation_column": "days_in_hospital",
#>   "output_feature_name": "total_days_in_hospital"
#> }

Notice that the feature above specifies a different "source_table". This new identifier can then be passed to run_pipeline(), together with the data frame that we calculated above.

res <- run_pipeline(
  data_sources = list(smr04_with_days = smr04_with_days_data),
  feature_filenames = total_days_filepath
)

dplyr::glimpse(res$features)
#> Rows: 20
#> Columns: 2
#> $ id                     <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
#> $ total_days_in_hospital <dbl> 6, 15, 6, 8, 13, 5, 12, 9, 3, 7, 7, 7, 11, 6, 1…

Feature 4: Longest single stay in hospital

In this feature, we are going to calculate the longest single stay in hospital for each patient. To do this, we need to add up the number of days spent in each episode within the same stay, before we take the maximum of these values.

Because the overall action is to take the maximum, the max transformation type is appropriate here. However, the summation must be accomplished through a preprocessing step. In this step, we need to group the data on the id and cis_marker columns, and then replace the values of days_in_hospital with the sum of the days over all episodes in the same group (i.e. the same stay). This will give us a table where each row still corresponds to an episode, but the days_in_hospital column has been modified to contain the total number of days for the stay that the episode belongs to.

For more details on preprocessing, see the corresponding vignette.

longest_stay_filepath <- eider_example("longest_stay.json")
writeLines(readLines(longest_stay_filepath))
#> {
#>   "source_table": "smr04_with_days",
#>   "transformation_type": "max",
#>   "grouping_column": "id",
#>   "absent_default_value": 0,
#>   "preprocess": {
#>     "on": [
#>       "id",
#>       "cis_marker"
#>     ],
#>     "replace_with_sum": [
#>       "days_in_hospital"
#>     ]
#>   },
#>   "aggregation_column": "days_in_hospital",
#>   "output_feature_name": "longest_stay"
#> }

res <- run_pipeline(
  data_sources = list(smr04_with_days = smr04_with_days_data),
  feature_filenames = longest_stay_filepath
)

dplyr::glimpse(res$features)
#> Rows: 20
#> Columns: 2
#> $ id           <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
#> $ longest_stay <dbl> 4, 5, 4, 3, 4, 3, 5, 6, 1, 3, 5, 3, 6, 4, 4, 3, 7, 7, 7, 6

Putting it all together

Just for good measure, let’s run the entire pipeline with all four of the features above in one go.

res <- run_pipeline(
  data_sources = list(
    smr04 = smr04_data_filepath,
    smr04_with_days = smr04_with_days_data
  ),
  feature_filenames = c(
    pt_episodes_filepath,
    pt_stays_filepath,
    total_days_filepath,
    longest_stay_filepath
  )
)

dplyr::glimpse(res$features)
#> Rows: 20
#> Columns: 5
#> $ id                         <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1…
#> $ num_psychotherapy_episodes <int> 3, 5, 2, 3, 5, 1, 1, 3, 0, 2, 1, 6, 4, 3, 9…
#> $ num_psychotherapy_stays    <int> 2, 4, 2, 3, 4, 1, 1, 2, 0, 2, 1, 4, 3, 2, 6…
#> $ total_days_in_hospital     <dbl> 6, 15, 6, 8, 13, 5, 12, 9, 3, 7, 7, 7, 11, …
#> $ longest_stay               <dbl> 4, 5, 4, 3, 4, 3, 5, 6, 1, 3, 5, 3, 6, 4, 4…