Split date rows by new year





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty,.everyoneloves__bot-mid-leaderboard:empty{ margin-bottom:0;
}







7















I have data from a hospital with many variables, and also from and to dates for each row, which tells us when each row is "valid". Each row can maximum be valid for a year.



test = data.frame(ID=c(10,10,10,12,12), Disease=c("P","P","P","D","P"), Pass=c("US","US","US","EN","EN"),
Payment=c(110,110,115,240,255),
from_date=as.POSIXct(c("2008-01-09","2009-01-09","2010-01-09","2008-01-01","2013-12-31")),
to_date=as.POSIXct(c("2009-01-08","2010-01-08","2011-01-08","2008-12-31","2014-12-30"))
)


For the rows that pass from one year to another, I want to split up the rows, such that I end up with two rows instead of the original row, and also manipulate the from_date and to_date, such that I end up with a new dataset looking like this:



  test_desired = data.frame(ID=c(10,10,10,10,10,10,12,12,12), Disease=c("P","P","P","P","P","P","D","P","P"), Pass=c("US","US","US","US","US","US","EN","EN","EN"),
Payment=c(110,110,110,110,115,115,240,255,255),
from_date=as.POSIXct(c("2008-01-09","2009-01-01","2009-01-09","2009-01-01","2010-01-09","2011-01-01","2008-01-01","2013-12-31","2014-01-01")),
to_date=as.POSIXct(c("2008-12-31","2009-01-08","2009-12-31","2010-01-08","2010-12-31","2011-01-08","2008-12-31","2013-12-31","2014-12-30"))
)


Attempt:



library(lubridate) #for function "year" below
test_desired=test
row=c()
tmp=c()
for(i in 1:nrow(test_desired)){
if(year(test_desired$from_date)[i]<year(test_desired$to_date)[i]){
test_desired$to_date[i] = as.POSIXct(paste0(year(test_desired$from_date[i]),"-12-31"))
row = test_desired[i,]
row$from_date = as.POSIXct(paste0(year(test$to_date[i]),"-01-01"))
row$to_date = test$to_date[i]
tmp=rbind(tmp,row)

} else next
}
test_desired=rbind(test_desired,tmp)
library(dplyr)
test_desired=arrange(test_desired,ID,from_date)


Is there a more elegant way of doing this, for example with dplyr?










share|improve this question































    7















    I have data from a hospital with many variables, and also from and to dates for each row, which tells us when each row is "valid". Each row can maximum be valid for a year.



    test = data.frame(ID=c(10,10,10,12,12), Disease=c("P","P","P","D","P"), Pass=c("US","US","US","EN","EN"),
    Payment=c(110,110,115,240,255),
    from_date=as.POSIXct(c("2008-01-09","2009-01-09","2010-01-09","2008-01-01","2013-12-31")),
    to_date=as.POSIXct(c("2009-01-08","2010-01-08","2011-01-08","2008-12-31","2014-12-30"))
    )


    For the rows that pass from one year to another, I want to split up the rows, such that I end up with two rows instead of the original row, and also manipulate the from_date and to_date, such that I end up with a new dataset looking like this:



      test_desired = data.frame(ID=c(10,10,10,10,10,10,12,12,12), Disease=c("P","P","P","P","P","P","D","P","P"), Pass=c("US","US","US","US","US","US","EN","EN","EN"),
    Payment=c(110,110,110,110,115,115,240,255,255),
    from_date=as.POSIXct(c("2008-01-09","2009-01-01","2009-01-09","2009-01-01","2010-01-09","2011-01-01","2008-01-01","2013-12-31","2014-01-01")),
    to_date=as.POSIXct(c("2008-12-31","2009-01-08","2009-12-31","2010-01-08","2010-12-31","2011-01-08","2008-12-31","2013-12-31","2014-12-30"))
    )


    Attempt:



    library(lubridate) #for function "year" below
    test_desired=test
    row=c()
    tmp=c()
    for(i in 1:nrow(test_desired)){
    if(year(test_desired$from_date)[i]<year(test_desired$to_date)[i]){
    test_desired$to_date[i] = as.POSIXct(paste0(year(test_desired$from_date[i]),"-12-31"))
    row = test_desired[i,]
    row$from_date = as.POSIXct(paste0(year(test$to_date[i]),"-01-01"))
    row$to_date = test$to_date[i]
    tmp=rbind(tmp,row)

    } else next
    }
    test_desired=rbind(test_desired,tmp)
    library(dplyr)
    test_desired=arrange(test_desired,ID,from_date)


    Is there a more elegant way of doing this, for example with dplyr?










    share|improve this question



























      7












      7








      7


      1






      I have data from a hospital with many variables, and also from and to dates for each row, which tells us when each row is "valid". Each row can maximum be valid for a year.



      test = data.frame(ID=c(10,10,10,12,12), Disease=c("P","P","P","D","P"), Pass=c("US","US","US","EN","EN"),
      Payment=c(110,110,115,240,255),
      from_date=as.POSIXct(c("2008-01-09","2009-01-09","2010-01-09","2008-01-01","2013-12-31")),
      to_date=as.POSIXct(c("2009-01-08","2010-01-08","2011-01-08","2008-12-31","2014-12-30"))
      )


      For the rows that pass from one year to another, I want to split up the rows, such that I end up with two rows instead of the original row, and also manipulate the from_date and to_date, such that I end up with a new dataset looking like this:



        test_desired = data.frame(ID=c(10,10,10,10,10,10,12,12,12), Disease=c("P","P","P","P","P","P","D","P","P"), Pass=c("US","US","US","US","US","US","EN","EN","EN"),
      Payment=c(110,110,110,110,115,115,240,255,255),
      from_date=as.POSIXct(c("2008-01-09","2009-01-01","2009-01-09","2009-01-01","2010-01-09","2011-01-01","2008-01-01","2013-12-31","2014-01-01")),
      to_date=as.POSIXct(c("2008-12-31","2009-01-08","2009-12-31","2010-01-08","2010-12-31","2011-01-08","2008-12-31","2013-12-31","2014-12-30"))
      )


      Attempt:



      library(lubridate) #for function "year" below
      test_desired=test
      row=c()
      tmp=c()
      for(i in 1:nrow(test_desired)){
      if(year(test_desired$from_date)[i]<year(test_desired$to_date)[i]){
      test_desired$to_date[i] = as.POSIXct(paste0(year(test_desired$from_date[i]),"-12-31"))
      row = test_desired[i,]
      row$from_date = as.POSIXct(paste0(year(test$to_date[i]),"-01-01"))
      row$to_date = test$to_date[i]
      tmp=rbind(tmp,row)

      } else next
      }
      test_desired=rbind(test_desired,tmp)
      library(dplyr)
      test_desired=arrange(test_desired,ID,from_date)


      Is there a more elegant way of doing this, for example with dplyr?










      share|improve this question
















      I have data from a hospital with many variables, and also from and to dates for each row, which tells us when each row is "valid". Each row can maximum be valid for a year.



      test = data.frame(ID=c(10,10,10,12,12), Disease=c("P","P","P","D","P"), Pass=c("US","US","US","EN","EN"),
      Payment=c(110,110,115,240,255),
      from_date=as.POSIXct(c("2008-01-09","2009-01-09","2010-01-09","2008-01-01","2013-12-31")),
      to_date=as.POSIXct(c("2009-01-08","2010-01-08","2011-01-08","2008-12-31","2014-12-30"))
      )


      For the rows that pass from one year to another, I want to split up the rows, such that I end up with two rows instead of the original row, and also manipulate the from_date and to_date, such that I end up with a new dataset looking like this:



        test_desired = data.frame(ID=c(10,10,10,10,10,10,12,12,12), Disease=c("P","P","P","P","P","P","D","P","P"), Pass=c("US","US","US","US","US","US","EN","EN","EN"),
      Payment=c(110,110,110,110,115,115,240,255,255),
      from_date=as.POSIXct(c("2008-01-09","2009-01-01","2009-01-09","2009-01-01","2010-01-09","2011-01-01","2008-01-01","2013-12-31","2014-01-01")),
      to_date=as.POSIXct(c("2008-12-31","2009-01-08","2009-12-31","2010-01-08","2010-12-31","2011-01-08","2008-12-31","2013-12-31","2014-12-30"))
      )


      Attempt:



      library(lubridate) #for function "year" below
      test_desired=test
      row=c()
      tmp=c()
      for(i in 1:nrow(test_desired)){
      if(year(test_desired$from_date)[i]<year(test_desired$to_date)[i]){
      test_desired$to_date[i] = as.POSIXct(paste0(year(test_desired$from_date[i]),"-12-31"))
      row = test_desired[i,]
      row$from_date = as.POSIXct(paste0(year(test$to_date[i]),"-01-01"))
      row$to_date = test$to_date[i]
      tmp=rbind(tmp,row)

      } else next
      }
      test_desired=rbind(test_desired,tmp)
      library(dplyr)
      test_desired=arrange(test_desired,ID,from_date)


      Is there a more elegant way of doing this, for example with dplyr?







      r






      share|improve this question















      share|improve this question













      share|improve this question




      share|improve this question








      edited May 19 at 10:26







      Helen

















      asked May 19 at 10:11









      HelenHelen

      4793 silver badges17 bronze badges




      4793 silver badges17 bronze badges
























          5 Answers
          5






          active

          oldest

          votes


















          3














          Here's a tidyverse based solution. It's similar to Lennyy's, but with fewer condition checks, and there's no issue with times being added (they might show up in a tibble, but as 00:00:00). I've added ungroup() because it sounds like you have a grouping variable somewhere (comment under Lennyy's solution). It can be removed if you don't:



          library(dplyr)
          library(lubridate)
          library(purrr)

          test %>%
          ungroup() %>% # This isn't necessary if there are no groupings.
          split(rownames(test)) %>%
          map_dfr(function(df){
          if (year(df$from_date) == year(df$to_date)) return(df)
          bind_rows(mutate(df, to_date = rollback(floor_date(to_date, "y"))),
          mutate(df, from_date = floor_date(to_date, "y"))
          )
          }
          )

          #### OUTPUT ####

          ID Disease Pass Payment from_date to_date
          1 10 P US 110 2008-01-09 2008-12-31
          2 10 P US 110 2009-01-01 2009-01-08
          3 10 P US 110 2009-01-09 2009-12-31
          4 10 P US 110 2010-01-01 2010-01-08
          5 10 P US 115 2010-01-09 2010-12-31
          6 10 P US 115 2011-01-01 2011-01-08
          7 12 D EN 240 2008-01-01 2008-12-31
          8 12 P EN 255 2013-12-31 2013-12-31
          9 12 P EN 255 2014-01-01 2014-12-30


          To explain: The dataframe is split into a list of rows. I then use map_dfr to run the function on each dataframe where from_date and to_date contain different years. map_dfr also binds the resulting dataframes together. Within the anonymous function I floor to_date by year, and then I either roll it back to the last day of the previous month for the new to_date in the first row, or leave it as it is for the new from_date in the second row.






          share|improve this answer

































            2














            Using from_date and to_date we can create a date sequence using seq.Date then split this sequence by year, finally select min and max of each year. Then use apply, separate_rows and separate to get the final result.



            cr_date <- function(d1, d2){
            #browser()
            sequence_date <- seq.Date(as.Date(d1), as.Date(d2), by='day')
            lst_dates <- lapply(split(sequence_date, lubridate::year(sequence_date)),
            function(x) paste0(min(x), '|', max(x)))
            result <- paste0(lst_dates, collapse = ';')
            return(result)
            }

            #Test
            #cr_date(as.Date('2008-01-09'),as.Date('2009-01-08'))
            test$flag <- apply(test, 1, function(x) cr_date(x['from_date'], x['to_date']))

            library(tidyr)
            separate_rows(test, flag, sep=';') %>%
            separate(flag, into = c('from_date_new','to_date_new'), '\|') %>%
            mutate_at(vars('from_date_new','to_date_new'), list(~as.Date(.)))


            ID Disease Pass Payment from_date to_date from_date_new to_date_new
            1 10 P US 110 2008-01-09 2009-01-08 2008-01-09 2008-12-31
            2 10 P US 110 2008-01-09 2009-01-08 2009-01-01 2009-01-08
            3 10 P US 110 2009-01-09 2010-01-08 2009-01-09 2009-12-31
            4 10 P US 110 2009-01-09 2010-01-08 2010-01-01 2010-01-08
            5 10 P US 115 2010-01-09 2011-01-08 2010-01-09 2010-12-31
            6 10 P US 115 2010-01-09 2011-01-08 2011-01-01 2011-01-08
            7 12 D EN 240 2008-01-01 2008-12-31 2008-01-01 2008-12-31
            8 12 P EN 255 2013-12-31 2014-12-30 2013-12-31 2013-12-31
            9 12 P EN 255 2013-12-31 2014-12-30 2014-01-01 2014-12-30





            share|improve this answer


























            • This gives me an error, and to_dates becomes NA: Warning message: Expected 2 pieces. Missing pieces filled with NA in 3547 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

              – Helen
              May 19 at 12:20











            • Yes, it's produced by separate. My dataset is quite big, is is something in particular you are looking for?

              – Helen
              May 19 at 12:28











            • So, one last question, this does not mantain the date format for the new date columns, correct?

              – Helen
              May 19 at 12:54






            • 1





              @Erosennin yes that is correct. See my update which handles this issue.

              – A. Suliman
              May 19 at 13:35



















            2














            This uses only base R.



            First note that only dates with no times are used so we should be using Date class, not POSIXct. The latter can needlessly introduce timezone errors unless you are very careful so in the Note at the end which shows the input used we assume that we are starting out with test2 which contains Date class data. The code in the Note also shows how to convert it to Date class if it it already POSIXct.



            Given test2 we add from_year, to_year and eoy (date at the end of the year) columns giving test3. Then we iterate over the rows and if the years are the same return the row and if not return the split rows. This gives a list of one and two row data frames which we rbind together.



            test3 <- transform(test2, 
            from_year = format(from_date, "%Y"),
            to_year = format(to_date, "%Y"),
            eoy = as.Date(sub("-.*", "-12-31", from_date)))

            nr <- nrow(test2)
            do.call("rbind", lapply(1:nr, function(i) with(test3[i, ],
            if (from_year == to_year) test2[i, ]
            else data.frame(ID, Disease, Pass, Payment,
            from_date = c(from_date, eoy+1),
            to_date = c(eoy, to_date)))
            ))


            Note



            Assumed input in reproducible form. As noted above it uses Date class.



            test2 <- transform(test, 
            from_date = as.Date(from_date),
            to_date = as.Date(to_date))





            share|improve this answer































              1














              You could as well try something like below using dplyr and lubridate. It works as following: 1. Duplicate the dataframe using rbind. 2. Arrange at first on ID, secondly on from_date and third on the order of rows given in test. 3. in the even rows, change from_date to the first day of the new year. 4. In the odd rows, change to_date to the last day of the previous year. 5. Finally, exclude the rows in which the difference between from_date and to_date is only 1 second.



              test %>% 
              rbind(test) %>%
              arrange(ID, from_date) %>%
              mutate(from_date = if_else(row_number() %% 2 == 0, ceiling_date(from_date, "year") + 1, from_date),
              to_date = if_else(row_number() %% 2 == 1, floor_date(to_date, "year") - 1, to_date)) %>%
              filter(from_date - to_date != 1)

              ID Disease Pass Payment from_date to_date
              1 10 P US 110 2008-01-09 00:00:00 2008-12-31 23:59:59
              2 10 P US 110 2009-01-01 00:00:01 2009-01-08 00:00:00
              3 10 P US 110 2009-01-09 00:00:00 2009-12-31 23:59:59
              4 10 P US 110 2010-01-01 00:00:01 2010-01-08 00:00:00
              5 10 P US 115 2010-01-09 00:00:00 2010-12-31 23:59:59
              6 10 P US 115 2011-01-01 00:00:01 2011-01-08 00:00:00
              7 12 D EN 240 2008-01-01 00:00:01 2008-12-31 00:00:00
              8 12 P EN 255 2013-12-31 00:00:00 2013-12-31 23:59:59
              9 12 P EN 255 2014-01-01 00:00:01 2014-12-30 00:00:00


              Only downside might be that times are added, but you could of course delete those. And in case a period might continue in a third year, you could use the same logic but with a second rbind and row_number() %% 3 == 0






              share|improve this answer


























              • I get an error: Error: Column from_date can't be modified because it's a grouping variable

                – Helen
                May 19 at 12:30











              • When you load test from your OP, there is no grouping variable. Else run ungroup first

                – Lennyy
                May 19 at 12:56











              • Oh, I think my lack of knowledge of dplyr is the issue here, sorry! How do I ungroup?

                – Helen
                May 19 at 13:16






              • 1





                @Erosennin just add ungroup() %>% below test %>%

                – gersht
                May 19 at 13:31



















              1














              I am just using a data.table which also provides a yearfunction
              and ignore the possibly slow date conversion logic with as.POSIXct.



              I am also assuming that the to_date and from_date may differ by one year only (not more than one year!).



              library(data.table)  # also provides a "year" function

              setDT(test)

              # Create additional rows for the new year
              additional_rows <- test[year(from_date) < year(to_date), ]
              additional_rows[, from_date := as.POSIXct(paste0(year(to_date),"-01-01"))]

              # Shorten the "from_date" of the affected original rows
              test[year(from_date) < year(to_date), to_date := as.POSIXct(paste0(year(from_date),"-12-31"))]

              # Create a combined data table as result
              result <- rbind(test, additional_rows)
              setkey(result, ID, Payment, from_date) # just to sort the data like the "test_desired" sort order


              which results in



              > result
              ID Disease Pass Payment from_date to_date
              1: 10 P US 110 2008-01-09 2008-12-31
              2: 10 P US 110 2009-01-01 2009-01-08
              3: 10 P US 110 2009-01-09 2009-12-31
              4: 10 P US 110 2010-01-01 2010-01-08
              5: 10 P US 115 2010-01-09 2010-12-31
              6: 10 P US 115 2011-01-01 2011-01-08
              7: 12 D EN 240 2008-01-01 2008-12-31
              8: 12 P EN 255 2013-12-31 2013-12-31
              9: 12 P EN 255 2014-01-01 2014-12-30





              share|improve this answer


























              • I'm having difficulties testing this solution as I'm using dplyr and lubridate and loading data.table masks some functions that I am already using.

                – Helen
                May 19 at 12:25











              • What kind of difficulties (symptoms)? An "easy" solution is to modify the order of library statements (the package loaded first wins until you specify a function name with the package name, eg. data.table::year. So: Try to put library(data.table) at the end of all other library statements and it should work...

                – R Yoda
                May 19 at 14:43














              Your Answer






              StackExchange.ifUsing("editor", function () {
              StackExchange.using("externalEditor", function () {
              StackExchange.using("snippets", function () {
              StackExchange.snippets.init();
              });
              });
              }, "code-snippets");

              StackExchange.ready(function() {
              var channelOptions = {
              tags: "".split(" "),
              id: "1"
              };
              initTagRenderer("".split(" "), "".split(" "), channelOptions);

              StackExchange.using("externalEditor", function() {
              // Have to fire editor after snippets, if snippets enabled
              if (StackExchange.settings.snippets.snippetsEnabled) {
              StackExchange.using("snippets", function() {
              createEditor();
              });
              }
              else {
              createEditor();
              }
              });

              function createEditor() {
              StackExchange.prepareEditor({
              heartbeatType: 'answer',
              autoActivateHeartbeat: false,
              convertImagesToLinks: true,
              noModals: true,
              showLowRepImageUploadWarning: true,
              reputationToPostImages: 10,
              bindNavPrevention: true,
              postfix: "",
              imageUploader: {
              brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
              contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
              allowUrls: true
              },
              onDemand: true,
              discardSelector: ".discard-answer"
              ,immediatelyShowMarkdownHelp:true
              });


              }
              });














              draft saved

              draft discarded


















              StackExchange.ready(
              function () {
              StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f56206794%2fsplit-date-rows-by-new-year%23new-answer', 'question_page');
              }
              );

              Post as a guest















              Required, but never shown

























              5 Answers
              5






              active

              oldest

              votes








              5 Answers
              5






              active

              oldest

              votes









              active

              oldest

              votes






              active

              oldest

              votes









              3














              Here's a tidyverse based solution. It's similar to Lennyy's, but with fewer condition checks, and there's no issue with times being added (they might show up in a tibble, but as 00:00:00). I've added ungroup() because it sounds like you have a grouping variable somewhere (comment under Lennyy's solution). It can be removed if you don't:



              library(dplyr)
              library(lubridate)
              library(purrr)

              test %>%
              ungroup() %>% # This isn't necessary if there are no groupings.
              split(rownames(test)) %>%
              map_dfr(function(df){
              if (year(df$from_date) == year(df$to_date)) return(df)
              bind_rows(mutate(df, to_date = rollback(floor_date(to_date, "y"))),
              mutate(df, from_date = floor_date(to_date, "y"))
              )
              }
              )

              #### OUTPUT ####

              ID Disease Pass Payment from_date to_date
              1 10 P US 110 2008-01-09 2008-12-31
              2 10 P US 110 2009-01-01 2009-01-08
              3 10 P US 110 2009-01-09 2009-12-31
              4 10 P US 110 2010-01-01 2010-01-08
              5 10 P US 115 2010-01-09 2010-12-31
              6 10 P US 115 2011-01-01 2011-01-08
              7 12 D EN 240 2008-01-01 2008-12-31
              8 12 P EN 255 2013-12-31 2013-12-31
              9 12 P EN 255 2014-01-01 2014-12-30


              To explain: The dataframe is split into a list of rows. I then use map_dfr to run the function on each dataframe where from_date and to_date contain different years. map_dfr also binds the resulting dataframes together. Within the anonymous function I floor to_date by year, and then I either roll it back to the last day of the previous month for the new to_date in the first row, or leave it as it is for the new from_date in the second row.






              share|improve this answer






























                3














                Here's a tidyverse based solution. It's similar to Lennyy's, but with fewer condition checks, and there's no issue with times being added (they might show up in a tibble, but as 00:00:00). I've added ungroup() because it sounds like you have a grouping variable somewhere (comment under Lennyy's solution). It can be removed if you don't:



                library(dplyr)
                library(lubridate)
                library(purrr)

                test %>%
                ungroup() %>% # This isn't necessary if there are no groupings.
                split(rownames(test)) %>%
                map_dfr(function(df){
                if (year(df$from_date) == year(df$to_date)) return(df)
                bind_rows(mutate(df, to_date = rollback(floor_date(to_date, "y"))),
                mutate(df, from_date = floor_date(to_date, "y"))
                )
                }
                )

                #### OUTPUT ####

                ID Disease Pass Payment from_date to_date
                1 10 P US 110 2008-01-09 2008-12-31
                2 10 P US 110 2009-01-01 2009-01-08
                3 10 P US 110 2009-01-09 2009-12-31
                4 10 P US 110 2010-01-01 2010-01-08
                5 10 P US 115 2010-01-09 2010-12-31
                6 10 P US 115 2011-01-01 2011-01-08
                7 12 D EN 240 2008-01-01 2008-12-31
                8 12 P EN 255 2013-12-31 2013-12-31
                9 12 P EN 255 2014-01-01 2014-12-30


                To explain: The dataframe is split into a list of rows. I then use map_dfr to run the function on each dataframe where from_date and to_date contain different years. map_dfr also binds the resulting dataframes together. Within the anonymous function I floor to_date by year, and then I either roll it back to the last day of the previous month for the new to_date in the first row, or leave it as it is for the new from_date in the second row.






                share|improve this answer




























                  3












                  3








                  3







                  Here's a tidyverse based solution. It's similar to Lennyy's, but with fewer condition checks, and there's no issue with times being added (they might show up in a tibble, but as 00:00:00). I've added ungroup() because it sounds like you have a grouping variable somewhere (comment under Lennyy's solution). It can be removed if you don't:



                  library(dplyr)
                  library(lubridate)
                  library(purrr)

                  test %>%
                  ungroup() %>% # This isn't necessary if there are no groupings.
                  split(rownames(test)) %>%
                  map_dfr(function(df){
                  if (year(df$from_date) == year(df$to_date)) return(df)
                  bind_rows(mutate(df, to_date = rollback(floor_date(to_date, "y"))),
                  mutate(df, from_date = floor_date(to_date, "y"))
                  )
                  }
                  )

                  #### OUTPUT ####

                  ID Disease Pass Payment from_date to_date
                  1 10 P US 110 2008-01-09 2008-12-31
                  2 10 P US 110 2009-01-01 2009-01-08
                  3 10 P US 110 2009-01-09 2009-12-31
                  4 10 P US 110 2010-01-01 2010-01-08
                  5 10 P US 115 2010-01-09 2010-12-31
                  6 10 P US 115 2011-01-01 2011-01-08
                  7 12 D EN 240 2008-01-01 2008-12-31
                  8 12 P EN 255 2013-12-31 2013-12-31
                  9 12 P EN 255 2014-01-01 2014-12-30


                  To explain: The dataframe is split into a list of rows. I then use map_dfr to run the function on each dataframe where from_date and to_date contain different years. map_dfr also binds the resulting dataframes together. Within the anonymous function I floor to_date by year, and then I either roll it back to the last day of the previous month for the new to_date in the first row, or leave it as it is for the new from_date in the second row.






                  share|improve this answer















                  Here's a tidyverse based solution. It's similar to Lennyy's, but with fewer condition checks, and there's no issue with times being added (they might show up in a tibble, but as 00:00:00). I've added ungroup() because it sounds like you have a grouping variable somewhere (comment under Lennyy's solution). It can be removed if you don't:



                  library(dplyr)
                  library(lubridate)
                  library(purrr)

                  test %>%
                  ungroup() %>% # This isn't necessary if there are no groupings.
                  split(rownames(test)) %>%
                  map_dfr(function(df){
                  if (year(df$from_date) == year(df$to_date)) return(df)
                  bind_rows(mutate(df, to_date = rollback(floor_date(to_date, "y"))),
                  mutate(df, from_date = floor_date(to_date, "y"))
                  )
                  }
                  )

                  #### OUTPUT ####

                  ID Disease Pass Payment from_date to_date
                  1 10 P US 110 2008-01-09 2008-12-31
                  2 10 P US 110 2009-01-01 2009-01-08
                  3 10 P US 110 2009-01-09 2009-12-31
                  4 10 P US 110 2010-01-01 2010-01-08
                  5 10 P US 115 2010-01-09 2010-12-31
                  6 10 P US 115 2011-01-01 2011-01-08
                  7 12 D EN 240 2008-01-01 2008-12-31
                  8 12 P EN 255 2013-12-31 2013-12-31
                  9 12 P EN 255 2014-01-01 2014-12-30


                  To explain: The dataframe is split into a list of rows. I then use map_dfr to run the function on each dataframe where from_date and to_date contain different years. map_dfr also binds the resulting dataframes together. Within the anonymous function I floor to_date by year, and then I either roll it back to the last day of the previous month for the new to_date in the first row, or leave it as it is for the new from_date in the second row.







                  share|improve this answer














                  share|improve this answer



                  share|improve this answer








                  edited May 19 at 13:37

























                  answered May 19 at 13:13









                  gershtgersht

                  2,1961 gold badge2 silver badges11 bronze badges




                  2,1961 gold badge2 silver badges11 bronze badges

























                      2














                      Using from_date and to_date we can create a date sequence using seq.Date then split this sequence by year, finally select min and max of each year. Then use apply, separate_rows and separate to get the final result.



                      cr_date <- function(d1, d2){
                      #browser()
                      sequence_date <- seq.Date(as.Date(d1), as.Date(d2), by='day')
                      lst_dates <- lapply(split(sequence_date, lubridate::year(sequence_date)),
                      function(x) paste0(min(x), '|', max(x)))
                      result <- paste0(lst_dates, collapse = ';')
                      return(result)
                      }

                      #Test
                      #cr_date(as.Date('2008-01-09'),as.Date('2009-01-08'))
                      test$flag <- apply(test, 1, function(x) cr_date(x['from_date'], x['to_date']))

                      library(tidyr)
                      separate_rows(test, flag, sep=';') %>%
                      separate(flag, into = c('from_date_new','to_date_new'), '\|') %>%
                      mutate_at(vars('from_date_new','to_date_new'), list(~as.Date(.)))


                      ID Disease Pass Payment from_date to_date from_date_new to_date_new
                      1 10 P US 110 2008-01-09 2009-01-08 2008-01-09 2008-12-31
                      2 10 P US 110 2008-01-09 2009-01-08 2009-01-01 2009-01-08
                      3 10 P US 110 2009-01-09 2010-01-08 2009-01-09 2009-12-31
                      4 10 P US 110 2009-01-09 2010-01-08 2010-01-01 2010-01-08
                      5 10 P US 115 2010-01-09 2011-01-08 2010-01-09 2010-12-31
                      6 10 P US 115 2010-01-09 2011-01-08 2011-01-01 2011-01-08
                      7 12 D EN 240 2008-01-01 2008-12-31 2008-01-01 2008-12-31
                      8 12 P EN 255 2013-12-31 2014-12-30 2013-12-31 2013-12-31
                      9 12 P EN 255 2013-12-31 2014-12-30 2014-01-01 2014-12-30





                      share|improve this answer


























                      • This gives me an error, and to_dates becomes NA: Warning message: Expected 2 pieces. Missing pieces filled with NA in 3547 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

                        – Helen
                        May 19 at 12:20











                      • Yes, it's produced by separate. My dataset is quite big, is is something in particular you are looking for?

                        – Helen
                        May 19 at 12:28











                      • So, one last question, this does not mantain the date format for the new date columns, correct?

                        – Helen
                        May 19 at 12:54






                      • 1





                        @Erosennin yes that is correct. See my update which handles this issue.

                        – A. Suliman
                        May 19 at 13:35
















                      2














                      Using from_date and to_date we can create a date sequence using seq.Date then split this sequence by year, finally select min and max of each year. Then use apply, separate_rows and separate to get the final result.



                      cr_date <- function(d1, d2){
                      #browser()
                      sequence_date <- seq.Date(as.Date(d1), as.Date(d2), by='day')
                      lst_dates <- lapply(split(sequence_date, lubridate::year(sequence_date)),
                      function(x) paste0(min(x), '|', max(x)))
                      result <- paste0(lst_dates, collapse = ';')
                      return(result)
                      }

                      #Test
                      #cr_date(as.Date('2008-01-09'),as.Date('2009-01-08'))
                      test$flag <- apply(test, 1, function(x) cr_date(x['from_date'], x['to_date']))

                      library(tidyr)
                      separate_rows(test, flag, sep=';') %>%
                      separate(flag, into = c('from_date_new','to_date_new'), '\|') %>%
                      mutate_at(vars('from_date_new','to_date_new'), list(~as.Date(.)))


                      ID Disease Pass Payment from_date to_date from_date_new to_date_new
                      1 10 P US 110 2008-01-09 2009-01-08 2008-01-09 2008-12-31
                      2 10 P US 110 2008-01-09 2009-01-08 2009-01-01 2009-01-08
                      3 10 P US 110 2009-01-09 2010-01-08 2009-01-09 2009-12-31
                      4 10 P US 110 2009-01-09 2010-01-08 2010-01-01 2010-01-08
                      5 10 P US 115 2010-01-09 2011-01-08 2010-01-09 2010-12-31
                      6 10 P US 115 2010-01-09 2011-01-08 2011-01-01 2011-01-08
                      7 12 D EN 240 2008-01-01 2008-12-31 2008-01-01 2008-12-31
                      8 12 P EN 255 2013-12-31 2014-12-30 2013-12-31 2013-12-31
                      9 12 P EN 255 2013-12-31 2014-12-30 2014-01-01 2014-12-30





                      share|improve this answer


























                      • This gives me an error, and to_dates becomes NA: Warning message: Expected 2 pieces. Missing pieces filled with NA in 3547 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

                        – Helen
                        May 19 at 12:20











                      • Yes, it's produced by separate. My dataset is quite big, is is something in particular you are looking for?

                        – Helen
                        May 19 at 12:28











                      • So, one last question, this does not mantain the date format for the new date columns, correct?

                        – Helen
                        May 19 at 12:54






                      • 1





                        @Erosennin yes that is correct. See my update which handles this issue.

                        – A. Suliman
                        May 19 at 13:35














                      2












                      2








                      2







                      Using from_date and to_date we can create a date sequence using seq.Date then split this sequence by year, finally select min and max of each year. Then use apply, separate_rows and separate to get the final result.



                      cr_date <- function(d1, d2){
                      #browser()
                      sequence_date <- seq.Date(as.Date(d1), as.Date(d2), by='day')
                      lst_dates <- lapply(split(sequence_date, lubridate::year(sequence_date)),
                      function(x) paste0(min(x), '|', max(x)))
                      result <- paste0(lst_dates, collapse = ';')
                      return(result)
                      }

                      #Test
                      #cr_date(as.Date('2008-01-09'),as.Date('2009-01-08'))
                      test$flag <- apply(test, 1, function(x) cr_date(x['from_date'], x['to_date']))

                      library(tidyr)
                      separate_rows(test, flag, sep=';') %>%
                      separate(flag, into = c('from_date_new','to_date_new'), '\|') %>%
                      mutate_at(vars('from_date_new','to_date_new'), list(~as.Date(.)))


                      ID Disease Pass Payment from_date to_date from_date_new to_date_new
                      1 10 P US 110 2008-01-09 2009-01-08 2008-01-09 2008-12-31
                      2 10 P US 110 2008-01-09 2009-01-08 2009-01-01 2009-01-08
                      3 10 P US 110 2009-01-09 2010-01-08 2009-01-09 2009-12-31
                      4 10 P US 110 2009-01-09 2010-01-08 2010-01-01 2010-01-08
                      5 10 P US 115 2010-01-09 2011-01-08 2010-01-09 2010-12-31
                      6 10 P US 115 2010-01-09 2011-01-08 2011-01-01 2011-01-08
                      7 12 D EN 240 2008-01-01 2008-12-31 2008-01-01 2008-12-31
                      8 12 P EN 255 2013-12-31 2014-12-30 2013-12-31 2013-12-31
                      9 12 P EN 255 2013-12-31 2014-12-30 2014-01-01 2014-12-30





                      share|improve this answer















                      Using from_date and to_date we can create a date sequence using seq.Date then split this sequence by year, finally select min and max of each year. Then use apply, separate_rows and separate to get the final result.



                      cr_date <- function(d1, d2){
                      #browser()
                      sequence_date <- seq.Date(as.Date(d1), as.Date(d2), by='day')
                      lst_dates <- lapply(split(sequence_date, lubridate::year(sequence_date)),
                      function(x) paste0(min(x), '|', max(x)))
                      result <- paste0(lst_dates, collapse = ';')
                      return(result)
                      }

                      #Test
                      #cr_date(as.Date('2008-01-09'),as.Date('2009-01-08'))
                      test$flag <- apply(test, 1, function(x) cr_date(x['from_date'], x['to_date']))

                      library(tidyr)
                      separate_rows(test, flag, sep=';') %>%
                      separate(flag, into = c('from_date_new','to_date_new'), '\|') %>%
                      mutate_at(vars('from_date_new','to_date_new'), list(~as.Date(.)))


                      ID Disease Pass Payment from_date to_date from_date_new to_date_new
                      1 10 P US 110 2008-01-09 2009-01-08 2008-01-09 2008-12-31
                      2 10 P US 110 2008-01-09 2009-01-08 2009-01-01 2009-01-08
                      3 10 P US 110 2009-01-09 2010-01-08 2009-01-09 2009-12-31
                      4 10 P US 110 2009-01-09 2010-01-08 2010-01-01 2010-01-08
                      5 10 P US 115 2010-01-09 2011-01-08 2010-01-09 2010-12-31
                      6 10 P US 115 2010-01-09 2011-01-08 2011-01-01 2011-01-08
                      7 12 D EN 240 2008-01-01 2008-12-31 2008-01-01 2008-12-31
                      8 12 P EN 255 2013-12-31 2014-12-30 2013-12-31 2013-12-31
                      9 12 P EN 255 2013-12-31 2014-12-30 2014-01-01 2014-12-30






                      share|improve this answer














                      share|improve this answer



                      share|improve this answer








                      edited May 19 at 13:33

























                      answered May 19 at 10:49









                      A. SulimanA. Suliman

                      7,0634 gold badges13 silver badges26 bronze badges




                      7,0634 gold badges13 silver badges26 bronze badges













                      • This gives me an error, and to_dates becomes NA: Warning message: Expected 2 pieces. Missing pieces filled with NA in 3547 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

                        – Helen
                        May 19 at 12:20











                      • Yes, it's produced by separate. My dataset is quite big, is is something in particular you are looking for?

                        – Helen
                        May 19 at 12:28











                      • So, one last question, this does not mantain the date format for the new date columns, correct?

                        – Helen
                        May 19 at 12:54






                      • 1





                        @Erosennin yes that is correct. See my update which handles this issue.

                        – A. Suliman
                        May 19 at 13:35



















                      • This gives me an error, and to_dates becomes NA: Warning message: Expected 2 pieces. Missing pieces filled with NA in 3547 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

                        – Helen
                        May 19 at 12:20











                      • Yes, it's produced by separate. My dataset is quite big, is is something in particular you are looking for?

                        – Helen
                        May 19 at 12:28











                      • So, one last question, this does not mantain the date format for the new date columns, correct?

                        – Helen
                        May 19 at 12:54






                      • 1





                        @Erosennin yes that is correct. See my update which handles this issue.

                        – A. Suliman
                        May 19 at 13:35

















                      This gives me an error, and to_dates becomes NA: Warning message: Expected 2 pieces. Missing pieces filled with NA in 3547 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

                      – Helen
                      May 19 at 12:20





                      This gives me an error, and to_dates becomes NA: Warning message: Expected 2 pieces. Missing pieces filled with NA in 3547 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

                      – Helen
                      May 19 at 12:20













                      Yes, it's produced by separate. My dataset is quite big, is is something in particular you are looking for?

                      – Helen
                      May 19 at 12:28





                      Yes, it's produced by separate. My dataset is quite big, is is something in particular you are looking for?

                      – Helen
                      May 19 at 12:28













                      So, one last question, this does not mantain the date format for the new date columns, correct?

                      – Helen
                      May 19 at 12:54





                      So, one last question, this does not mantain the date format for the new date columns, correct?

                      – Helen
                      May 19 at 12:54




                      1




                      1





                      @Erosennin yes that is correct. See my update which handles this issue.

                      – A. Suliman
                      May 19 at 13:35





                      @Erosennin yes that is correct. See my update which handles this issue.

                      – A. Suliman
                      May 19 at 13:35











                      2














                      This uses only base R.



                      First note that only dates with no times are used so we should be using Date class, not POSIXct. The latter can needlessly introduce timezone errors unless you are very careful so in the Note at the end which shows the input used we assume that we are starting out with test2 which contains Date class data. The code in the Note also shows how to convert it to Date class if it it already POSIXct.



                      Given test2 we add from_year, to_year and eoy (date at the end of the year) columns giving test3. Then we iterate over the rows and if the years are the same return the row and if not return the split rows. This gives a list of one and two row data frames which we rbind together.



                      test3 <- transform(test2, 
                      from_year = format(from_date, "%Y"),
                      to_year = format(to_date, "%Y"),
                      eoy = as.Date(sub("-.*", "-12-31", from_date)))

                      nr <- nrow(test2)
                      do.call("rbind", lapply(1:nr, function(i) with(test3[i, ],
                      if (from_year == to_year) test2[i, ]
                      else data.frame(ID, Disease, Pass, Payment,
                      from_date = c(from_date, eoy+1),
                      to_date = c(eoy, to_date)))
                      ))


                      Note



                      Assumed input in reproducible form. As noted above it uses Date class.



                      test2 <- transform(test, 
                      from_date = as.Date(from_date),
                      to_date = as.Date(to_date))





                      share|improve this answer




























                        2














                        This uses only base R.



                        First note that only dates with no times are used so we should be using Date class, not POSIXct. The latter can needlessly introduce timezone errors unless you are very careful so in the Note at the end which shows the input used we assume that we are starting out with test2 which contains Date class data. The code in the Note also shows how to convert it to Date class if it it already POSIXct.



                        Given test2 we add from_year, to_year and eoy (date at the end of the year) columns giving test3. Then we iterate over the rows and if the years are the same return the row and if not return the split rows. This gives a list of one and two row data frames which we rbind together.



                        test3 <- transform(test2, 
                        from_year = format(from_date, "%Y"),
                        to_year = format(to_date, "%Y"),
                        eoy = as.Date(sub("-.*", "-12-31", from_date)))

                        nr <- nrow(test2)
                        do.call("rbind", lapply(1:nr, function(i) with(test3[i, ],
                        if (from_year == to_year) test2[i, ]
                        else data.frame(ID, Disease, Pass, Payment,
                        from_date = c(from_date, eoy+1),
                        to_date = c(eoy, to_date)))
                        ))


                        Note



                        Assumed input in reproducible form. As noted above it uses Date class.



                        test2 <- transform(test, 
                        from_date = as.Date(from_date),
                        to_date = as.Date(to_date))





                        share|improve this answer


























                          2












                          2








                          2







                          This uses only base R.



                          First note that only dates with no times are used so we should be using Date class, not POSIXct. The latter can needlessly introduce timezone errors unless you are very careful so in the Note at the end which shows the input used we assume that we are starting out with test2 which contains Date class data. The code in the Note also shows how to convert it to Date class if it it already POSIXct.



                          Given test2 we add from_year, to_year and eoy (date at the end of the year) columns giving test3. Then we iterate over the rows and if the years are the same return the row and if not return the split rows. This gives a list of one and two row data frames which we rbind together.



                          test3 <- transform(test2, 
                          from_year = format(from_date, "%Y"),
                          to_year = format(to_date, "%Y"),
                          eoy = as.Date(sub("-.*", "-12-31", from_date)))

                          nr <- nrow(test2)
                          do.call("rbind", lapply(1:nr, function(i) with(test3[i, ],
                          if (from_year == to_year) test2[i, ]
                          else data.frame(ID, Disease, Pass, Payment,
                          from_date = c(from_date, eoy+1),
                          to_date = c(eoy, to_date)))
                          ))


                          Note



                          Assumed input in reproducible form. As noted above it uses Date class.



                          test2 <- transform(test, 
                          from_date = as.Date(from_date),
                          to_date = as.Date(to_date))





                          share|improve this answer













                          This uses only base R.



                          First note that only dates with no times are used so we should be using Date class, not POSIXct. The latter can needlessly introduce timezone errors unless you are very careful so in the Note at the end which shows the input used we assume that we are starting out with test2 which contains Date class data. The code in the Note also shows how to convert it to Date class if it it already POSIXct.



                          Given test2 we add from_year, to_year and eoy (date at the end of the year) columns giving test3. Then we iterate over the rows and if the years are the same return the row and if not return the split rows. This gives a list of one and two row data frames which we rbind together.



                          test3 <- transform(test2, 
                          from_year = format(from_date, "%Y"),
                          to_year = format(to_date, "%Y"),
                          eoy = as.Date(sub("-.*", "-12-31", from_date)))

                          nr <- nrow(test2)
                          do.call("rbind", lapply(1:nr, function(i) with(test3[i, ],
                          if (from_year == to_year) test2[i, ]
                          else data.frame(ID, Disease, Pass, Payment,
                          from_date = c(from_date, eoy+1),
                          to_date = c(eoy, to_date)))
                          ))


                          Note



                          Assumed input in reproducible form. As noted above it uses Date class.



                          test2 <- transform(test, 
                          from_date = as.Date(from_date),
                          to_date = as.Date(to_date))






                          share|improve this answer












                          share|improve this answer



                          share|improve this answer










                          answered May 19 at 17:09









                          G. GrothendieckG. Grothendieck

                          161k11 gold badges143 silver badges252 bronze badges




                          161k11 gold badges143 silver badges252 bronze badges























                              1














                              You could as well try something like below using dplyr and lubridate. It works as following: 1. Duplicate the dataframe using rbind. 2. Arrange at first on ID, secondly on from_date and third on the order of rows given in test. 3. in the even rows, change from_date to the first day of the new year. 4. In the odd rows, change to_date to the last day of the previous year. 5. Finally, exclude the rows in which the difference between from_date and to_date is only 1 second.



                              test %>% 
                              rbind(test) %>%
                              arrange(ID, from_date) %>%
                              mutate(from_date = if_else(row_number() %% 2 == 0, ceiling_date(from_date, "year") + 1, from_date),
                              to_date = if_else(row_number() %% 2 == 1, floor_date(to_date, "year") - 1, to_date)) %>%
                              filter(from_date - to_date != 1)

                              ID Disease Pass Payment from_date to_date
                              1 10 P US 110 2008-01-09 00:00:00 2008-12-31 23:59:59
                              2 10 P US 110 2009-01-01 00:00:01 2009-01-08 00:00:00
                              3 10 P US 110 2009-01-09 00:00:00 2009-12-31 23:59:59
                              4 10 P US 110 2010-01-01 00:00:01 2010-01-08 00:00:00
                              5 10 P US 115 2010-01-09 00:00:00 2010-12-31 23:59:59
                              6 10 P US 115 2011-01-01 00:00:01 2011-01-08 00:00:00
                              7 12 D EN 240 2008-01-01 00:00:01 2008-12-31 00:00:00
                              8 12 P EN 255 2013-12-31 00:00:00 2013-12-31 23:59:59
                              9 12 P EN 255 2014-01-01 00:00:01 2014-12-30 00:00:00


                              Only downside might be that times are added, but you could of course delete those. And in case a period might continue in a third year, you could use the same logic but with a second rbind and row_number() %% 3 == 0






                              share|improve this answer


























                              • I get an error: Error: Column from_date can't be modified because it's a grouping variable

                                – Helen
                                May 19 at 12:30











                              • When you load test from your OP, there is no grouping variable. Else run ungroup first

                                – Lennyy
                                May 19 at 12:56











                              • Oh, I think my lack of knowledge of dplyr is the issue here, sorry! How do I ungroup?

                                – Helen
                                May 19 at 13:16






                              • 1





                                @Erosennin just add ungroup() %>% below test %>%

                                – gersht
                                May 19 at 13:31
















                              1














                              You could as well try something like below using dplyr and lubridate. It works as following: 1. Duplicate the dataframe using rbind. 2. Arrange at first on ID, secondly on from_date and third on the order of rows given in test. 3. in the even rows, change from_date to the first day of the new year. 4. In the odd rows, change to_date to the last day of the previous year. 5. Finally, exclude the rows in which the difference between from_date and to_date is only 1 second.



                              test %>% 
                              rbind(test) %>%
                              arrange(ID, from_date) %>%
                              mutate(from_date = if_else(row_number() %% 2 == 0, ceiling_date(from_date, "year") + 1, from_date),
                              to_date = if_else(row_number() %% 2 == 1, floor_date(to_date, "year") - 1, to_date)) %>%
                              filter(from_date - to_date != 1)

                              ID Disease Pass Payment from_date to_date
                              1 10 P US 110 2008-01-09 00:00:00 2008-12-31 23:59:59
                              2 10 P US 110 2009-01-01 00:00:01 2009-01-08 00:00:00
                              3 10 P US 110 2009-01-09 00:00:00 2009-12-31 23:59:59
                              4 10 P US 110 2010-01-01 00:00:01 2010-01-08 00:00:00
                              5 10 P US 115 2010-01-09 00:00:00 2010-12-31 23:59:59
                              6 10 P US 115 2011-01-01 00:00:01 2011-01-08 00:00:00
                              7 12 D EN 240 2008-01-01 00:00:01 2008-12-31 00:00:00
                              8 12 P EN 255 2013-12-31 00:00:00 2013-12-31 23:59:59
                              9 12 P EN 255 2014-01-01 00:00:01 2014-12-30 00:00:00


                              Only downside might be that times are added, but you could of course delete those. And in case a period might continue in a third year, you could use the same logic but with a second rbind and row_number() %% 3 == 0






                              share|improve this answer


























                              • I get an error: Error: Column from_date can't be modified because it's a grouping variable

                                – Helen
                                May 19 at 12:30











                              • When you load test from your OP, there is no grouping variable. Else run ungroup first

                                – Lennyy
                                May 19 at 12:56











                              • Oh, I think my lack of knowledge of dplyr is the issue here, sorry! How do I ungroup?

                                – Helen
                                May 19 at 13:16






                              • 1





                                @Erosennin just add ungroup() %>% below test %>%

                                – gersht
                                May 19 at 13:31














                              1












                              1








                              1







                              You could as well try something like below using dplyr and lubridate. It works as following: 1. Duplicate the dataframe using rbind. 2. Arrange at first on ID, secondly on from_date and third on the order of rows given in test. 3. in the even rows, change from_date to the first day of the new year. 4. In the odd rows, change to_date to the last day of the previous year. 5. Finally, exclude the rows in which the difference between from_date and to_date is only 1 second.



                              test %>% 
                              rbind(test) %>%
                              arrange(ID, from_date) %>%
                              mutate(from_date = if_else(row_number() %% 2 == 0, ceiling_date(from_date, "year") + 1, from_date),
                              to_date = if_else(row_number() %% 2 == 1, floor_date(to_date, "year") - 1, to_date)) %>%
                              filter(from_date - to_date != 1)

                              ID Disease Pass Payment from_date to_date
                              1 10 P US 110 2008-01-09 00:00:00 2008-12-31 23:59:59
                              2 10 P US 110 2009-01-01 00:00:01 2009-01-08 00:00:00
                              3 10 P US 110 2009-01-09 00:00:00 2009-12-31 23:59:59
                              4 10 P US 110 2010-01-01 00:00:01 2010-01-08 00:00:00
                              5 10 P US 115 2010-01-09 00:00:00 2010-12-31 23:59:59
                              6 10 P US 115 2011-01-01 00:00:01 2011-01-08 00:00:00
                              7 12 D EN 240 2008-01-01 00:00:01 2008-12-31 00:00:00
                              8 12 P EN 255 2013-12-31 00:00:00 2013-12-31 23:59:59
                              9 12 P EN 255 2014-01-01 00:00:01 2014-12-30 00:00:00


                              Only downside might be that times are added, but you could of course delete those. And in case a period might continue in a third year, you could use the same logic but with a second rbind and row_number() %% 3 == 0






                              share|improve this answer















                              You could as well try something like below using dplyr and lubridate. It works as following: 1. Duplicate the dataframe using rbind. 2. Arrange at first on ID, secondly on from_date and third on the order of rows given in test. 3. in the even rows, change from_date to the first day of the new year. 4. In the odd rows, change to_date to the last day of the previous year. 5. Finally, exclude the rows in which the difference between from_date and to_date is only 1 second.



                              test %>% 
                              rbind(test) %>%
                              arrange(ID, from_date) %>%
                              mutate(from_date = if_else(row_number() %% 2 == 0, ceiling_date(from_date, "year") + 1, from_date),
                              to_date = if_else(row_number() %% 2 == 1, floor_date(to_date, "year") - 1, to_date)) %>%
                              filter(from_date - to_date != 1)

                              ID Disease Pass Payment from_date to_date
                              1 10 P US 110 2008-01-09 00:00:00 2008-12-31 23:59:59
                              2 10 P US 110 2009-01-01 00:00:01 2009-01-08 00:00:00
                              3 10 P US 110 2009-01-09 00:00:00 2009-12-31 23:59:59
                              4 10 P US 110 2010-01-01 00:00:01 2010-01-08 00:00:00
                              5 10 P US 115 2010-01-09 00:00:00 2010-12-31 23:59:59
                              6 10 P US 115 2011-01-01 00:00:01 2011-01-08 00:00:00
                              7 12 D EN 240 2008-01-01 00:00:01 2008-12-31 00:00:00
                              8 12 P EN 255 2013-12-31 00:00:00 2013-12-31 23:59:59
                              9 12 P EN 255 2014-01-01 00:00:01 2014-12-30 00:00:00


                              Only downside might be that times are added, but you could of course delete those. And in case a period might continue in a third year, you could use the same logic but with a second rbind and row_number() %% 3 == 0







                              share|improve this answer














                              share|improve this answer



                              share|improve this answer








                              edited May 19 at 12:11

























                              answered May 19 at 11:58









                              LennyyLennyy

                              3,8452 gold badges4 silver badges17 bronze badges




                              3,8452 gold badges4 silver badges17 bronze badges













                              • I get an error: Error: Column from_date can't be modified because it's a grouping variable

                                – Helen
                                May 19 at 12:30











                              • When you load test from your OP, there is no grouping variable. Else run ungroup first

                                – Lennyy
                                May 19 at 12:56











                              • Oh, I think my lack of knowledge of dplyr is the issue here, sorry! How do I ungroup?

                                – Helen
                                May 19 at 13:16






                              • 1





                                @Erosennin just add ungroup() %>% below test %>%

                                – gersht
                                May 19 at 13:31



















                              • I get an error: Error: Column from_date can't be modified because it's a grouping variable

                                – Helen
                                May 19 at 12:30











                              • When you load test from your OP, there is no grouping variable. Else run ungroup first

                                – Lennyy
                                May 19 at 12:56











                              • Oh, I think my lack of knowledge of dplyr is the issue here, sorry! How do I ungroup?

                                – Helen
                                May 19 at 13:16






                              • 1





                                @Erosennin just add ungroup() %>% below test %>%

                                – gersht
                                May 19 at 13:31

















                              I get an error: Error: Column from_date can't be modified because it's a grouping variable

                              – Helen
                              May 19 at 12:30





                              I get an error: Error: Column from_date can't be modified because it's a grouping variable

                              – Helen
                              May 19 at 12:30













                              When you load test from your OP, there is no grouping variable. Else run ungroup first

                              – Lennyy
                              May 19 at 12:56





                              When you load test from your OP, there is no grouping variable. Else run ungroup first

                              – Lennyy
                              May 19 at 12:56













                              Oh, I think my lack of knowledge of dplyr is the issue here, sorry! How do I ungroup?

                              – Helen
                              May 19 at 13:16





                              Oh, I think my lack of knowledge of dplyr is the issue here, sorry! How do I ungroup?

                              – Helen
                              May 19 at 13:16




                              1




                              1





                              @Erosennin just add ungroup() %>% below test %>%

                              – gersht
                              May 19 at 13:31





                              @Erosennin just add ungroup() %>% below test %>%

                              – gersht
                              May 19 at 13:31











                              1














                              I am just using a data.table which also provides a yearfunction
                              and ignore the possibly slow date conversion logic with as.POSIXct.



                              I am also assuming that the to_date and from_date may differ by one year only (not more than one year!).



                              library(data.table)  # also provides a "year" function

                              setDT(test)

                              # Create additional rows for the new year
                              additional_rows <- test[year(from_date) < year(to_date), ]
                              additional_rows[, from_date := as.POSIXct(paste0(year(to_date),"-01-01"))]

                              # Shorten the "from_date" of the affected original rows
                              test[year(from_date) < year(to_date), to_date := as.POSIXct(paste0(year(from_date),"-12-31"))]

                              # Create a combined data table as result
                              result <- rbind(test, additional_rows)
                              setkey(result, ID, Payment, from_date) # just to sort the data like the "test_desired" sort order


                              which results in



                              > result
                              ID Disease Pass Payment from_date to_date
                              1: 10 P US 110 2008-01-09 2008-12-31
                              2: 10 P US 110 2009-01-01 2009-01-08
                              3: 10 P US 110 2009-01-09 2009-12-31
                              4: 10 P US 110 2010-01-01 2010-01-08
                              5: 10 P US 115 2010-01-09 2010-12-31
                              6: 10 P US 115 2011-01-01 2011-01-08
                              7: 12 D EN 240 2008-01-01 2008-12-31
                              8: 12 P EN 255 2013-12-31 2013-12-31
                              9: 12 P EN 255 2014-01-01 2014-12-30





                              share|improve this answer


























                              • I'm having difficulties testing this solution as I'm using dplyr and lubridate and loading data.table masks some functions that I am already using.

                                – Helen
                                May 19 at 12:25











                              • What kind of difficulties (symptoms)? An "easy" solution is to modify the order of library statements (the package loaded first wins until you specify a function name with the package name, eg. data.table::year. So: Try to put library(data.table) at the end of all other library statements and it should work...

                                – R Yoda
                                May 19 at 14:43
















                              1














                              I am just using a data.table which also provides a yearfunction
                              and ignore the possibly slow date conversion logic with as.POSIXct.



                              I am also assuming that the to_date and from_date may differ by one year only (not more than one year!).



                              library(data.table)  # also provides a "year" function

                              setDT(test)

                              # Create additional rows for the new year
                              additional_rows <- test[year(from_date) < year(to_date), ]
                              additional_rows[, from_date := as.POSIXct(paste0(year(to_date),"-01-01"))]

                              # Shorten the "from_date" of the affected original rows
                              test[year(from_date) < year(to_date), to_date := as.POSIXct(paste0(year(from_date),"-12-31"))]

                              # Create a combined data table as result
                              result <- rbind(test, additional_rows)
                              setkey(result, ID, Payment, from_date) # just to sort the data like the "test_desired" sort order


                              which results in



                              > result
                              ID Disease Pass Payment from_date to_date
                              1: 10 P US 110 2008-01-09 2008-12-31
                              2: 10 P US 110 2009-01-01 2009-01-08
                              3: 10 P US 110 2009-01-09 2009-12-31
                              4: 10 P US 110 2010-01-01 2010-01-08
                              5: 10 P US 115 2010-01-09 2010-12-31
                              6: 10 P US 115 2011-01-01 2011-01-08
                              7: 12 D EN 240 2008-01-01 2008-12-31
                              8: 12 P EN 255 2013-12-31 2013-12-31
                              9: 12 P EN 255 2014-01-01 2014-12-30





                              share|improve this answer


























                              • I'm having difficulties testing this solution as I'm using dplyr and lubridate and loading data.table masks some functions that I am already using.

                                – Helen
                                May 19 at 12:25











                              • What kind of difficulties (symptoms)? An "easy" solution is to modify the order of library statements (the package loaded first wins until you specify a function name with the package name, eg. data.table::year. So: Try to put library(data.table) at the end of all other library statements and it should work...

                                – R Yoda
                                May 19 at 14:43














                              1












                              1








                              1







                              I am just using a data.table which also provides a yearfunction
                              and ignore the possibly slow date conversion logic with as.POSIXct.



                              I am also assuming that the to_date and from_date may differ by one year only (not more than one year!).



                              library(data.table)  # also provides a "year" function

                              setDT(test)

                              # Create additional rows for the new year
                              additional_rows <- test[year(from_date) < year(to_date), ]
                              additional_rows[, from_date := as.POSIXct(paste0(year(to_date),"-01-01"))]

                              # Shorten the "from_date" of the affected original rows
                              test[year(from_date) < year(to_date), to_date := as.POSIXct(paste0(year(from_date),"-12-31"))]

                              # Create a combined data table as result
                              result <- rbind(test, additional_rows)
                              setkey(result, ID, Payment, from_date) # just to sort the data like the "test_desired" sort order


                              which results in



                              > result
                              ID Disease Pass Payment from_date to_date
                              1: 10 P US 110 2008-01-09 2008-12-31
                              2: 10 P US 110 2009-01-01 2009-01-08
                              3: 10 P US 110 2009-01-09 2009-12-31
                              4: 10 P US 110 2010-01-01 2010-01-08
                              5: 10 P US 115 2010-01-09 2010-12-31
                              6: 10 P US 115 2011-01-01 2011-01-08
                              7: 12 D EN 240 2008-01-01 2008-12-31
                              8: 12 P EN 255 2013-12-31 2013-12-31
                              9: 12 P EN 255 2014-01-01 2014-12-30





                              share|improve this answer















                              I am just using a data.table which also provides a yearfunction
                              and ignore the possibly slow date conversion logic with as.POSIXct.



                              I am also assuming that the to_date and from_date may differ by one year only (not more than one year!).



                              library(data.table)  # also provides a "year" function

                              setDT(test)

                              # Create additional rows for the new year
                              additional_rows <- test[year(from_date) < year(to_date), ]
                              additional_rows[, from_date := as.POSIXct(paste0(year(to_date),"-01-01"))]

                              # Shorten the "from_date" of the affected original rows
                              test[year(from_date) < year(to_date), to_date := as.POSIXct(paste0(year(from_date),"-12-31"))]

                              # Create a combined data table as result
                              result <- rbind(test, additional_rows)
                              setkey(result, ID, Payment, from_date) # just to sort the data like the "test_desired" sort order


                              which results in



                              > result
                              ID Disease Pass Payment from_date to_date
                              1: 10 P US 110 2008-01-09 2008-12-31
                              2: 10 P US 110 2009-01-01 2009-01-08
                              3: 10 P US 110 2009-01-09 2009-12-31
                              4: 10 P US 110 2010-01-01 2010-01-08
                              5: 10 P US 115 2010-01-09 2010-12-31
                              6: 10 P US 115 2011-01-01 2011-01-08
                              7: 12 D EN 240 2008-01-01 2008-12-31
                              8: 12 P EN 255 2013-12-31 2013-12-31
                              9: 12 P EN 255 2014-01-01 2014-12-30






                              share|improve this answer














                              share|improve this answer



                              share|improve this answer








                              edited May 20 at 6:14

























                              answered May 19 at 10:34









                              R YodaR Yoda

                              4,48820 silver badges49 bronze badges




                              4,48820 silver badges49 bronze badges













                              • I'm having difficulties testing this solution as I'm using dplyr and lubridate and loading data.table masks some functions that I am already using.

                                – Helen
                                May 19 at 12:25











                              • What kind of difficulties (symptoms)? An "easy" solution is to modify the order of library statements (the package loaded first wins until you specify a function name with the package name, eg. data.table::year. So: Try to put library(data.table) at the end of all other library statements and it should work...

                                – R Yoda
                                May 19 at 14:43



















                              • I'm having difficulties testing this solution as I'm using dplyr and lubridate and loading data.table masks some functions that I am already using.

                                – Helen
                                May 19 at 12:25











                              • What kind of difficulties (symptoms)? An "easy" solution is to modify the order of library statements (the package loaded first wins until you specify a function name with the package name, eg. data.table::year. So: Try to put library(data.table) at the end of all other library statements and it should work...

                                – R Yoda
                                May 19 at 14:43

















                              I'm having difficulties testing this solution as I'm using dplyr and lubridate and loading data.table masks some functions that I am already using.

                              – Helen
                              May 19 at 12:25





                              I'm having difficulties testing this solution as I'm using dplyr and lubridate and loading data.table masks some functions that I am already using.

                              – Helen
                              May 19 at 12:25













                              What kind of difficulties (symptoms)? An "easy" solution is to modify the order of library statements (the package loaded first wins until you specify a function name with the package name, eg. data.table::year. So: Try to put library(data.table) at the end of all other library statements and it should work...

                              – R Yoda
                              May 19 at 14:43





                              What kind of difficulties (symptoms)? An "easy" solution is to modify the order of library statements (the package loaded first wins until you specify a function name with the package name, eg. data.table::year. So: Try to put library(data.table) at the end of all other library statements and it should work...

                              – R Yoda
                              May 19 at 14:43


















                              draft saved

                              draft discarded




















































                              Thanks for contributing an answer to Stack Overflow!


                              • Please be sure to answer the question. Provide details and share your research!

                              But avoid



                              • Asking for help, clarification, or responding to other answers.

                              • Making statements based on opinion; back them up with references or personal experience.


                              To learn more, see our tips on writing great answers.




                              draft saved


                              draft discarded














                              StackExchange.ready(
                              function () {
                              StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f56206794%2fsplit-date-rows-by-new-year%23new-answer', 'question_page');
                              }
                              );

                              Post as a guest















                              Required, but never shown





















































                              Required, but never shown














                              Required, but never shown












                              Required, but never shown







                              Required, but never shown

































                              Required, but never shown














                              Required, but never shown












                              Required, but never shown







                              Required, but never shown







                              Popular posts from this blog

                              Bruad Bilen | Luke uk diar | NawigatsjuunCommonskategorii: BruadCommonskategorii: RunstükenWikiquote: Bruad

                              What is the offset in a seaplane's hull?

                              Slayer Innehåll Historia | Stil, komposition och lyrik | Bandets betydelse och framgångar | Sidoprojekt och samarbeten | Kontroverser | Medlemmar | Utmärkelser och nomineringar | Turnéer och festivaler | Diskografi | Referenser | Externa länkar | Navigeringsmenywww.slayer.net”Metal Massacre vol. 1””Metal Massacre vol. 3””Metal Massacre Volume III””Show No Mercy””Haunting the Chapel””Live Undead””Hell Awaits””Reign in Blood””Reign in Blood””Gold & Platinum – Reign in Blood””Golden Gods Awards Winners”originalet”Kerrang! Hall Of Fame””Slayer Looks Back On 37-Year Career In New Video Series: Part Two””South of Heaven””Gold & Platinum – South of Heaven””Seasons in the Abyss””Gold & Platinum - Seasons in the Abyss””Divine Intervention””Divine Intervention - Release group by Slayer””Gold & Platinum - Divine Intervention””Live Intrusion””Undisputed Attitude””Abolish Government/Superficial Love””Release “Slatanic Slaughter: A Tribute to Slayer” by Various Artists””Diabolus in Musica””Soundtrack to the Apocalypse””God Hates Us All””Systematic - Relationships””War at the Warfield””Gold & Platinum - War at the Warfield””Soundtrack to the Apocalypse””Gold & Platinum - Still Reigning””Metallica, Slayer, Iron Mauden Among Winners At Metal Hammer Awards””Eternal Pyre””Eternal Pyre - Slayer release group””Eternal Pyre””Metal Storm Awards 2006””Kerrang! Hall Of Fame””Slayer Wins 'Best Metal' Grammy Award””Slayer Guitarist Jeff Hanneman Dies””Bullet-For My Valentine booed at Metal Hammer Golden Gods Awards””Unholy Aliance””The End Of Slayer?””Slayer: We Could Thrash Out Two More Albums If We're Fast Enough...””'The Unholy Alliance: Chapter III' UK Dates Added”originalet”Megadeth And Slayer To Co-Headline 'Canadian Carnage' Trek”originalet”World Painted Blood””Release “World Painted Blood” by Slayer””Metallica Heading To Cinemas””Slayer, Megadeth To Join Forces For 'European Carnage' Tour - Dec. 18, 2010”originalet”Slayer's Hanneman Contracts Acute Infection; Band To Bring In Guest Guitarist””Cannibal Corpse's Pat O'Brien Will Step In As Slayer's Guest Guitarist”originalet”Slayer’s Jeff Hanneman Dead at 49””Dave Lombardo Says He Made Only $67,000 In 2011 While Touring With Slayer””Slayer: We Do Not Agree With Dave Lombardo's Substance Or Timeline Of Events””Slayer Welcomes Drummer Paul Bostaph Back To The Fold””Slayer Hope to Unveil Never-Before-Heard Jeff Hanneman Material on Next Album””Slayer Debut New Song 'Implode' During Surprise Golden Gods Appearance””Release group Repentless by Slayer””Repentless - Slayer - Credits””Slayer””Metal Storm Awards 2015””Slayer - to release comic book "Repentless #1"””Slayer To Release 'Repentless' 6.66" Vinyl Box Set””BREAKING NEWS: Slayer Announce Farewell Tour””Slayer Recruit Lamb of God, Anthrax, Behemoth + Testament for Final Tour””Slayer lägger ner efter 37 år””Slayer Announces Second North American Leg Of 'Final' Tour””Final World Tour””Slayer Announces Final European Tour With Lamb of God, Anthrax And Obituary””Slayer To Tour Europe With Lamb of God, Anthrax And Obituary””Slayer To Play 'Last French Show Ever' At Next Year's Hellfst””Slayer's Final World Tour Will Extend Into 2019””Death Angel's Rob Cavestany On Slayer's 'Farewell' Tour: 'Some Of Us Could See This Coming'””Testament Has No Plans To Retire Anytime Soon, Says Chuck Billy””Anthrax's Scott Ian On Slayer's 'Farewell' Tour Plans: 'I Was Surprised And I Wasn't Surprised'””Slayer””Slayer's Morbid Schlock””Review/Rock; For Slayer, the Mania Is the Message””Slayer - Biography””Slayer - Reign In Blood”originalet”Dave Lombardo””An exclusive oral history of Slayer”originalet”Exclusive! Interview With Slayer Guitarist Jeff Hanneman”originalet”Thinking Out Loud: Slayer's Kerry King on hair metal, Satan and being polite””Slayer Lyrics””Slayer - Biography””Most influential artists for extreme metal music””Slayer - Reign in Blood””Slayer guitarist Jeff Hanneman dies aged 49””Slatanic Slaughter: A Tribute to Slayer””Gateway to Hell: A Tribute to Slayer””Covered In Blood””Slayer: The Origins of Thrash in San Francisco, CA.””Why They Rule - #6 Slayer”originalet”Guitar World's 100 Greatest Heavy Metal Guitarists Of All Time”originalet”The fans have spoken: Slayer comes out on top in readers' polls”originalet”Tribute to Jeff Hanneman (1964-2013)””Lamb Of God Frontman: We Sound Like A Slayer Rip-Off””BEHEMOTH Frontman Pays Tribute To SLAYER's JEFF HANNEMAN””Slayer, Hatebreed Doing Double Duty On This Year's Ozzfest””System of a Down””Lacuna Coil’s Andrea Ferro Talks Influences, Skateboarding, Band Origins + More””Slayer - Reign in Blood””Into The Lungs of Hell””Slayer rules - en utställning om fans””Slayer and Their Fans Slashed Through a No-Holds-Barred Night at Gas Monkey””Home””Slayer””Gold & Platinum - The Big 4 Live from Sofia, Bulgaria””Exclusive! Interview With Slayer Guitarist Kerry King””2008-02-23: Wiltern, Los Angeles, CA, USA””Slayer's Kerry King To Perform With Megadeth Tonight! - Oct. 21, 2010”originalet”Dave Lombardo - Biography”Slayer Case DismissedArkiveradUltimate Classic Rock: Slayer guitarist Jeff Hanneman dead at 49.”Slayer: "We could never do any thing like Some Kind Of Monster..."””Cannibal Corpse'S Pat O'Brien Will Step In As Slayer'S Guest Guitarist | The Official Slayer Site”originalet”Slayer Wins 'Best Metal' Grammy Award””Slayer Guitarist Jeff Hanneman Dies””Kerrang! Awards 2006 Blog: Kerrang! Hall Of Fame””Kerrang! Awards 2013: Kerrang! Legend”originalet”Metallica, Slayer, Iron Maien Among Winners At Metal Hammer Awards””Metal Hammer Golden Gods Awards””Bullet For My Valentine Booed At Metal Hammer Golden Gods Awards””Metal Storm Awards 2006””Metal Storm Awards 2015””Slayer's Concert History””Slayer - Relationships””Slayer - Releases”Slayers officiella webbplatsSlayer på MusicBrainzOfficiell webbplatsSlayerSlayerr1373445760000 0001 1540 47353068615-5086262726cb13906545x(data)6033143kn20030215029