Split date rows by new year





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty,.everyoneloves__bot-mid-leaderboard:empty{ margin-bottom:0;
}







7















I have data from a hospital with many variables, and also from and to dates for each row, which tells us when each row is "valid". Each row can maximum be valid for a year.



test = data.frame(ID=c(10,10,10,12,12), Disease=c("P","P","P","D","P"), Pass=c("US","US","US","EN","EN"),
Payment=c(110,110,115,240,255),
from_date=as.POSIXct(c("2008-01-09","2009-01-09","2010-01-09","2008-01-01","2013-12-31")),
to_date=as.POSIXct(c("2009-01-08","2010-01-08","2011-01-08","2008-12-31","2014-12-30"))
)


For the rows that pass from one year to another, I want to split up the rows, such that I end up with two rows instead of the original row, and also manipulate the from_date and to_date, such that I end up with a new dataset looking like this:



  test_desired = data.frame(ID=c(10,10,10,10,10,10,12,12,12), Disease=c("P","P","P","P","P","P","D","P","P"), Pass=c("US","US","US","US","US","US","EN","EN","EN"),
Payment=c(110,110,110,110,115,115,240,255,255),
from_date=as.POSIXct(c("2008-01-09","2009-01-01","2009-01-09","2009-01-01","2010-01-09","2011-01-01","2008-01-01","2013-12-31","2014-01-01")),
to_date=as.POSIXct(c("2008-12-31","2009-01-08","2009-12-31","2010-01-08","2010-12-31","2011-01-08","2008-12-31","2013-12-31","2014-12-30"))
)


Attempt:



library(lubridate) #for function "year" below
test_desired=test
row=c()
tmp=c()
for(i in 1:nrow(test_desired)){
if(year(test_desired$from_date)[i]<year(test_desired$to_date)[i]){
test_desired$to_date[i] = as.POSIXct(paste0(year(test_desired$from_date[i]),"-12-31"))
row = test_desired[i,]
row$from_date = as.POSIXct(paste0(year(test$to_date[i]),"-01-01"))
row$to_date = test$to_date[i]
tmp=rbind(tmp,row)

} else next
}
test_desired=rbind(test_desired,tmp)
library(dplyr)
test_desired=arrange(test_desired,ID,from_date)


Is there a more elegant way of doing this, for example with dplyr?










share|improve this question































    7















    I have data from a hospital with many variables, and also from and to dates for each row, which tells us when each row is "valid". Each row can maximum be valid for a year.



    test = data.frame(ID=c(10,10,10,12,12), Disease=c("P","P","P","D","P"), Pass=c("US","US","US","EN","EN"),
    Payment=c(110,110,115,240,255),
    from_date=as.POSIXct(c("2008-01-09","2009-01-09","2010-01-09","2008-01-01","2013-12-31")),
    to_date=as.POSIXct(c("2009-01-08","2010-01-08","2011-01-08","2008-12-31","2014-12-30"))
    )


    For the rows that pass from one year to another, I want to split up the rows, such that I end up with two rows instead of the original row, and also manipulate the from_date and to_date, such that I end up with a new dataset looking like this:



      test_desired = data.frame(ID=c(10,10,10,10,10,10,12,12,12), Disease=c("P","P","P","P","P","P","D","P","P"), Pass=c("US","US","US","US","US","US","EN","EN","EN"),
    Payment=c(110,110,110,110,115,115,240,255,255),
    from_date=as.POSIXct(c("2008-01-09","2009-01-01","2009-01-09","2009-01-01","2010-01-09","2011-01-01","2008-01-01","2013-12-31","2014-01-01")),
    to_date=as.POSIXct(c("2008-12-31","2009-01-08","2009-12-31","2010-01-08","2010-12-31","2011-01-08","2008-12-31","2013-12-31","2014-12-30"))
    )


    Attempt:



    library(lubridate) #for function "year" below
    test_desired=test
    row=c()
    tmp=c()
    for(i in 1:nrow(test_desired)){
    if(year(test_desired$from_date)[i]<year(test_desired$to_date)[i]){
    test_desired$to_date[i] = as.POSIXct(paste0(year(test_desired$from_date[i]),"-12-31"))
    row = test_desired[i,]
    row$from_date = as.POSIXct(paste0(year(test$to_date[i]),"-01-01"))
    row$to_date = test$to_date[i]
    tmp=rbind(tmp,row)

    } else next
    }
    test_desired=rbind(test_desired,tmp)
    library(dplyr)
    test_desired=arrange(test_desired,ID,from_date)


    Is there a more elegant way of doing this, for example with dplyr?










    share|improve this question



























      7












      7








      7


      1






      I have data from a hospital with many variables, and also from and to dates for each row, which tells us when each row is "valid". Each row can maximum be valid for a year.



      test = data.frame(ID=c(10,10,10,12,12), Disease=c("P","P","P","D","P"), Pass=c("US","US","US","EN","EN"),
      Payment=c(110,110,115,240,255),
      from_date=as.POSIXct(c("2008-01-09","2009-01-09","2010-01-09","2008-01-01","2013-12-31")),
      to_date=as.POSIXct(c("2009-01-08","2010-01-08","2011-01-08","2008-12-31","2014-12-30"))
      )


      For the rows that pass from one year to another, I want to split up the rows, such that I end up with two rows instead of the original row, and also manipulate the from_date and to_date, such that I end up with a new dataset looking like this:



        test_desired = data.frame(ID=c(10,10,10,10,10,10,12,12,12), Disease=c("P","P","P","P","P","P","D","P","P"), Pass=c("US","US","US","US","US","US","EN","EN","EN"),
      Payment=c(110,110,110,110,115,115,240,255,255),
      from_date=as.POSIXct(c("2008-01-09","2009-01-01","2009-01-09","2009-01-01","2010-01-09","2011-01-01","2008-01-01","2013-12-31","2014-01-01")),
      to_date=as.POSIXct(c("2008-12-31","2009-01-08","2009-12-31","2010-01-08","2010-12-31","2011-01-08","2008-12-31","2013-12-31","2014-12-30"))
      )


      Attempt:



      library(lubridate) #for function "year" below
      test_desired=test
      row=c()
      tmp=c()
      for(i in 1:nrow(test_desired)){
      if(year(test_desired$from_date)[i]<year(test_desired$to_date)[i]){
      test_desired$to_date[i] = as.POSIXct(paste0(year(test_desired$from_date[i]),"-12-31"))
      row = test_desired[i,]
      row$from_date = as.POSIXct(paste0(year(test$to_date[i]),"-01-01"))
      row$to_date = test$to_date[i]
      tmp=rbind(tmp,row)

      } else next
      }
      test_desired=rbind(test_desired,tmp)
      library(dplyr)
      test_desired=arrange(test_desired,ID,from_date)


      Is there a more elegant way of doing this, for example with dplyr?










      share|improve this question
















      I have data from a hospital with many variables, and also from and to dates for each row, which tells us when each row is "valid". Each row can maximum be valid for a year.



      test = data.frame(ID=c(10,10,10,12,12), Disease=c("P","P","P","D","P"), Pass=c("US","US","US","EN","EN"),
      Payment=c(110,110,115,240,255),
      from_date=as.POSIXct(c("2008-01-09","2009-01-09","2010-01-09","2008-01-01","2013-12-31")),
      to_date=as.POSIXct(c("2009-01-08","2010-01-08","2011-01-08","2008-12-31","2014-12-30"))
      )


      For the rows that pass from one year to another, I want to split up the rows, such that I end up with two rows instead of the original row, and also manipulate the from_date and to_date, such that I end up with a new dataset looking like this:



        test_desired = data.frame(ID=c(10,10,10,10,10,10,12,12,12), Disease=c("P","P","P","P","P","P","D","P","P"), Pass=c("US","US","US","US","US","US","EN","EN","EN"),
      Payment=c(110,110,110,110,115,115,240,255,255),
      from_date=as.POSIXct(c("2008-01-09","2009-01-01","2009-01-09","2009-01-01","2010-01-09","2011-01-01","2008-01-01","2013-12-31","2014-01-01")),
      to_date=as.POSIXct(c("2008-12-31","2009-01-08","2009-12-31","2010-01-08","2010-12-31","2011-01-08","2008-12-31","2013-12-31","2014-12-30"))
      )


      Attempt:



      library(lubridate) #for function "year" below
      test_desired=test
      row=c()
      tmp=c()
      for(i in 1:nrow(test_desired)){
      if(year(test_desired$from_date)[i]<year(test_desired$to_date)[i]){
      test_desired$to_date[i] = as.POSIXct(paste0(year(test_desired$from_date[i]),"-12-31"))
      row = test_desired[i,]
      row$from_date = as.POSIXct(paste0(year(test$to_date[i]),"-01-01"))
      row$to_date = test$to_date[i]
      tmp=rbind(tmp,row)

      } else next
      }
      test_desired=rbind(test_desired,tmp)
      library(dplyr)
      test_desired=arrange(test_desired,ID,from_date)


      Is there a more elegant way of doing this, for example with dplyr?







      r






      share|improve this question















      share|improve this question













      share|improve this question




      share|improve this question








      edited May 19 at 10:26







      Helen

















      asked May 19 at 10:11









      HelenHelen

      4793 silver badges17 bronze badges




      4793 silver badges17 bronze badges
























          5 Answers
          5






          active

          oldest

          votes


















          3














          Here's a tidyverse based solution. It's similar to Lennyy's, but with fewer condition checks, and there's no issue with times being added (they might show up in a tibble, but as 00:00:00). I've added ungroup() because it sounds like you have a grouping variable somewhere (comment under Lennyy's solution). It can be removed if you don't:



          library(dplyr)
          library(lubridate)
          library(purrr)

          test %>%
          ungroup() %>% # This isn't necessary if there are no groupings.
          split(rownames(test)) %>%
          map_dfr(function(df){
          if (year(df$from_date) == year(df$to_date)) return(df)
          bind_rows(mutate(df, to_date = rollback(floor_date(to_date, "y"))),
          mutate(df, from_date = floor_date(to_date, "y"))
          )
          }
          )

          #### OUTPUT ####

          ID Disease Pass Payment from_date to_date
          1 10 P US 110 2008-01-09 2008-12-31
          2 10 P US 110 2009-01-01 2009-01-08
          3 10 P US 110 2009-01-09 2009-12-31
          4 10 P US 110 2010-01-01 2010-01-08
          5 10 P US 115 2010-01-09 2010-12-31
          6 10 P US 115 2011-01-01 2011-01-08
          7 12 D EN 240 2008-01-01 2008-12-31
          8 12 P EN 255 2013-12-31 2013-12-31
          9 12 P EN 255 2014-01-01 2014-12-30


          To explain: The dataframe is split into a list of rows. I then use map_dfr to run the function on each dataframe where from_date and to_date contain different years. map_dfr also binds the resulting dataframes together. Within the anonymous function I floor to_date by year, and then I either roll it back to the last day of the previous month for the new to_date in the first row, or leave it as it is for the new from_date in the second row.






          share|improve this answer

































            2














            Using from_date and to_date we can create a date sequence using seq.Date then split this sequence by year, finally select min and max of each year. Then use apply, separate_rows and separate to get the final result.



            cr_date <- function(d1, d2){
            #browser()
            sequence_date <- seq.Date(as.Date(d1), as.Date(d2), by='day')
            lst_dates <- lapply(split(sequence_date, lubridate::year(sequence_date)),
            function(x) paste0(min(x), '|', max(x)))
            result <- paste0(lst_dates, collapse = ';')
            return(result)
            }

            #Test
            #cr_date(as.Date('2008-01-09'),as.Date('2009-01-08'))
            test$flag <- apply(test, 1, function(x) cr_date(x['from_date'], x['to_date']))

            library(tidyr)
            separate_rows(test, flag, sep=';') %>%
            separate(flag, into = c('from_date_new','to_date_new'), '\|') %>%
            mutate_at(vars('from_date_new','to_date_new'), list(~as.Date(.)))


            ID Disease Pass Payment from_date to_date from_date_new to_date_new
            1 10 P US 110 2008-01-09 2009-01-08 2008-01-09 2008-12-31
            2 10 P US 110 2008-01-09 2009-01-08 2009-01-01 2009-01-08
            3 10 P US 110 2009-01-09 2010-01-08 2009-01-09 2009-12-31
            4 10 P US 110 2009-01-09 2010-01-08 2010-01-01 2010-01-08
            5 10 P US 115 2010-01-09 2011-01-08 2010-01-09 2010-12-31
            6 10 P US 115 2010-01-09 2011-01-08 2011-01-01 2011-01-08
            7 12 D EN 240 2008-01-01 2008-12-31 2008-01-01 2008-12-31
            8 12 P EN 255 2013-12-31 2014-12-30 2013-12-31 2013-12-31
            9 12 P EN 255 2013-12-31 2014-12-30 2014-01-01 2014-12-30





            share|improve this answer


























            • This gives me an error, and to_dates becomes NA: Warning message: Expected 2 pieces. Missing pieces filled with NA in 3547 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

              – Helen
              May 19 at 12:20











            • Yes, it's produced by separate. My dataset is quite big, is is something in particular you are looking for?

              – Helen
              May 19 at 12:28











            • So, one last question, this does not mantain the date format for the new date columns, correct?

              – Helen
              May 19 at 12:54






            • 1





              @Erosennin yes that is correct. See my update which handles this issue.

              – A. Suliman
              May 19 at 13:35



















            2














            This uses only base R.



            First note that only dates with no times are used so we should be using Date class, not POSIXct. The latter can needlessly introduce timezone errors unless you are very careful so in the Note at the end which shows the input used we assume that we are starting out with test2 which contains Date class data. The code in the Note also shows how to convert it to Date class if it it already POSIXct.



            Given test2 we add from_year, to_year and eoy (date at the end of the year) columns giving test3. Then we iterate over the rows and if the years are the same return the row and if not return the split rows. This gives a list of one and two row data frames which we rbind together.



            test3 <- transform(test2, 
            from_year = format(from_date, "%Y"),
            to_year = format(to_date, "%Y"),
            eoy = as.Date(sub("-.*", "-12-31", from_date)))

            nr <- nrow(test2)
            do.call("rbind", lapply(1:nr, function(i) with(test3[i, ],
            if (from_year == to_year) test2[i, ]
            else data.frame(ID, Disease, Pass, Payment,
            from_date = c(from_date, eoy+1),
            to_date = c(eoy, to_date)))
            ))


            Note



            Assumed input in reproducible form. As noted above it uses Date class.



            test2 <- transform(test, 
            from_date = as.Date(from_date),
            to_date = as.Date(to_date))





            share|improve this answer































              1














              You could as well try something like below using dplyr and lubridate. It works as following: 1. Duplicate the dataframe using rbind. 2. Arrange at first on ID, secondly on from_date and third on the order of rows given in test. 3. in the even rows, change from_date to the first day of the new year. 4. In the odd rows, change to_date to the last day of the previous year. 5. Finally, exclude the rows in which the difference between from_date and to_date is only 1 second.



              test %>% 
              rbind(test) %>%
              arrange(ID, from_date) %>%
              mutate(from_date = if_else(row_number() %% 2 == 0, ceiling_date(from_date, "year") + 1, from_date),
              to_date = if_else(row_number() %% 2 == 1, floor_date(to_date, "year") - 1, to_date)) %>%
              filter(from_date - to_date != 1)

              ID Disease Pass Payment from_date to_date
              1 10 P US 110 2008-01-09 00:00:00 2008-12-31 23:59:59
              2 10 P US 110 2009-01-01 00:00:01 2009-01-08 00:00:00
              3 10 P US 110 2009-01-09 00:00:00 2009-12-31 23:59:59
              4 10 P US 110 2010-01-01 00:00:01 2010-01-08 00:00:00
              5 10 P US 115 2010-01-09 00:00:00 2010-12-31 23:59:59
              6 10 P US 115 2011-01-01 00:00:01 2011-01-08 00:00:00
              7 12 D EN 240 2008-01-01 00:00:01 2008-12-31 00:00:00
              8 12 P EN 255 2013-12-31 00:00:00 2013-12-31 23:59:59
              9 12 P EN 255 2014-01-01 00:00:01 2014-12-30 00:00:00


              Only downside might be that times are added, but you could of course delete those. And in case a period might continue in a third year, you could use the same logic but with a second rbind and row_number() %% 3 == 0






              share|improve this answer


























              • I get an error: Error: Column from_date can't be modified because it's a grouping variable

                – Helen
                May 19 at 12:30











              • When you load test from your OP, there is no grouping variable. Else run ungroup first

                – Lennyy
                May 19 at 12:56











              • Oh, I think my lack of knowledge of dplyr is the issue here, sorry! How do I ungroup?

                – Helen
                May 19 at 13:16






              • 1





                @Erosennin just add ungroup() %>% below test %>%

                – gersht
                May 19 at 13:31



















              1














              I am just using a data.table which also provides a yearfunction
              and ignore the possibly slow date conversion logic with as.POSIXct.



              I am also assuming that the to_date and from_date may differ by one year only (not more than one year!).



              library(data.table)  # also provides a "year" function

              setDT(test)

              # Create additional rows for the new year
              additional_rows <- test[year(from_date) < year(to_date), ]
              additional_rows[, from_date := as.POSIXct(paste0(year(to_date),"-01-01"))]

              # Shorten the "from_date" of the affected original rows
              test[year(from_date) < year(to_date), to_date := as.POSIXct(paste0(year(from_date),"-12-31"))]

              # Create a combined data table as result
              result <- rbind(test, additional_rows)
              setkey(result, ID, Payment, from_date) # just to sort the data like the "test_desired" sort order


              which results in



              > result
              ID Disease Pass Payment from_date to_date
              1: 10 P US 110 2008-01-09 2008-12-31
              2: 10 P US 110 2009-01-01 2009-01-08
              3: 10 P US 110 2009-01-09 2009-12-31
              4: 10 P US 110 2010-01-01 2010-01-08
              5: 10 P US 115 2010-01-09 2010-12-31
              6: 10 P US 115 2011-01-01 2011-01-08
              7: 12 D EN 240 2008-01-01 2008-12-31
              8: 12 P EN 255 2013-12-31 2013-12-31
              9: 12 P EN 255 2014-01-01 2014-12-30





              share|improve this answer


























              • I'm having difficulties testing this solution as I'm using dplyr and lubridate and loading data.table masks some functions that I am already using.

                – Helen
                May 19 at 12:25











              • What kind of difficulties (symptoms)? An "easy" solution is to modify the order of library statements (the package loaded first wins until you specify a function name with the package name, eg. data.table::year. So: Try to put library(data.table) at the end of all other library statements and it should work...

                – R Yoda
                May 19 at 14:43














              Your Answer






              StackExchange.ifUsing("editor", function () {
              StackExchange.using("externalEditor", function () {
              StackExchange.using("snippets", function () {
              StackExchange.snippets.init();
              });
              });
              }, "code-snippets");

              StackExchange.ready(function() {
              var channelOptions = {
              tags: "".split(" "),
              id: "1"
              };
              initTagRenderer("".split(" "), "".split(" "), channelOptions);

              StackExchange.using("externalEditor", function() {
              // Have to fire editor after snippets, if snippets enabled
              if (StackExchange.settings.snippets.snippetsEnabled) {
              StackExchange.using("snippets", function() {
              createEditor();
              });
              }
              else {
              createEditor();
              }
              });

              function createEditor() {
              StackExchange.prepareEditor({
              heartbeatType: 'answer',
              autoActivateHeartbeat: false,
              convertImagesToLinks: true,
              noModals: true,
              showLowRepImageUploadWarning: true,
              reputationToPostImages: 10,
              bindNavPrevention: true,
              postfix: "",
              imageUploader: {
              brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
              contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
              allowUrls: true
              },
              onDemand: true,
              discardSelector: ".discard-answer"
              ,immediatelyShowMarkdownHelp:true
              });


              }
              });














              draft saved

              draft discarded


















              StackExchange.ready(
              function () {
              StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f56206794%2fsplit-date-rows-by-new-year%23new-answer', 'question_page');
              }
              );

              Post as a guest















              Required, but never shown

























              5 Answers
              5






              active

              oldest

              votes








              5 Answers
              5






              active

              oldest

              votes









              active

              oldest

              votes






              active

              oldest

              votes









              3














              Here's a tidyverse based solution. It's similar to Lennyy's, but with fewer condition checks, and there's no issue with times being added (they might show up in a tibble, but as 00:00:00). I've added ungroup() because it sounds like you have a grouping variable somewhere (comment under Lennyy's solution). It can be removed if you don't:



              library(dplyr)
              library(lubridate)
              library(purrr)

              test %>%
              ungroup() %>% # This isn't necessary if there are no groupings.
              split(rownames(test)) %>%
              map_dfr(function(df){
              if (year(df$from_date) == year(df$to_date)) return(df)
              bind_rows(mutate(df, to_date = rollback(floor_date(to_date, "y"))),
              mutate(df, from_date = floor_date(to_date, "y"))
              )
              }
              )

              #### OUTPUT ####

              ID Disease Pass Payment from_date to_date
              1 10 P US 110 2008-01-09 2008-12-31
              2 10 P US 110 2009-01-01 2009-01-08
              3 10 P US 110 2009-01-09 2009-12-31
              4 10 P US 110 2010-01-01 2010-01-08
              5 10 P US 115 2010-01-09 2010-12-31
              6 10 P US 115 2011-01-01 2011-01-08
              7 12 D EN 240 2008-01-01 2008-12-31
              8 12 P EN 255 2013-12-31 2013-12-31
              9 12 P EN 255 2014-01-01 2014-12-30


              To explain: The dataframe is split into a list of rows. I then use map_dfr to run the function on each dataframe where from_date and to_date contain different years. map_dfr also binds the resulting dataframes together. Within the anonymous function I floor to_date by year, and then I either roll it back to the last day of the previous month for the new to_date in the first row, or leave it as it is for the new from_date in the second row.






              share|improve this answer






























                3














                Here's a tidyverse based solution. It's similar to Lennyy's, but with fewer condition checks, and there's no issue with times being added (they might show up in a tibble, but as 00:00:00). I've added ungroup() because it sounds like you have a grouping variable somewhere (comment under Lennyy's solution). It can be removed if you don't:



                library(dplyr)
                library(lubridate)
                library(purrr)

                test %>%
                ungroup() %>% # This isn't necessary if there are no groupings.
                split(rownames(test)) %>%
                map_dfr(function(df){
                if (year(df$from_date) == year(df$to_date)) return(df)
                bind_rows(mutate(df, to_date = rollback(floor_date(to_date, "y"))),
                mutate(df, from_date = floor_date(to_date, "y"))
                )
                }
                )

                #### OUTPUT ####

                ID Disease Pass Payment from_date to_date
                1 10 P US 110 2008-01-09 2008-12-31
                2 10 P US 110 2009-01-01 2009-01-08
                3 10 P US 110 2009-01-09 2009-12-31
                4 10 P US 110 2010-01-01 2010-01-08
                5 10 P US 115 2010-01-09 2010-12-31
                6 10 P US 115 2011-01-01 2011-01-08
                7 12 D EN 240 2008-01-01 2008-12-31
                8 12 P EN 255 2013-12-31 2013-12-31
                9 12 P EN 255 2014-01-01 2014-12-30


                To explain: The dataframe is split into a list of rows. I then use map_dfr to run the function on each dataframe where from_date and to_date contain different years. map_dfr also binds the resulting dataframes together. Within the anonymous function I floor to_date by year, and then I either roll it back to the last day of the previous month for the new to_date in the first row, or leave it as it is for the new from_date in the second row.






                share|improve this answer




























                  3












                  3








                  3







                  Here's a tidyverse based solution. It's similar to Lennyy's, but with fewer condition checks, and there's no issue with times being added (they might show up in a tibble, but as 00:00:00). I've added ungroup() because it sounds like you have a grouping variable somewhere (comment under Lennyy's solution). It can be removed if you don't:



                  library(dplyr)
                  library(lubridate)
                  library(purrr)

                  test %>%
                  ungroup() %>% # This isn't necessary if there are no groupings.
                  split(rownames(test)) %>%
                  map_dfr(function(df){
                  if (year(df$from_date) == year(df$to_date)) return(df)
                  bind_rows(mutate(df, to_date = rollback(floor_date(to_date, "y"))),
                  mutate(df, from_date = floor_date(to_date, "y"))
                  )
                  }
                  )

                  #### OUTPUT ####

                  ID Disease Pass Payment from_date to_date
                  1 10 P US 110 2008-01-09 2008-12-31
                  2 10 P US 110 2009-01-01 2009-01-08
                  3 10 P US 110 2009-01-09 2009-12-31
                  4 10 P US 110 2010-01-01 2010-01-08
                  5 10 P US 115 2010-01-09 2010-12-31
                  6 10 P US 115 2011-01-01 2011-01-08
                  7 12 D EN 240 2008-01-01 2008-12-31
                  8 12 P EN 255 2013-12-31 2013-12-31
                  9 12 P EN 255 2014-01-01 2014-12-30


                  To explain: The dataframe is split into a list of rows. I then use map_dfr to run the function on each dataframe where from_date and to_date contain different years. map_dfr also binds the resulting dataframes together. Within the anonymous function I floor to_date by year, and then I either roll it back to the last day of the previous month for the new to_date in the first row, or leave it as it is for the new from_date in the second row.






                  share|improve this answer















                  Here's a tidyverse based solution. It's similar to Lennyy's, but with fewer condition checks, and there's no issue with times being added (they might show up in a tibble, but as 00:00:00). I've added ungroup() because it sounds like you have a grouping variable somewhere (comment under Lennyy's solution). It can be removed if you don't:



                  library(dplyr)
                  library(lubridate)
                  library(purrr)

                  test %>%
                  ungroup() %>% # This isn't necessary if there are no groupings.
                  split(rownames(test)) %>%
                  map_dfr(function(df){
                  if (year(df$from_date) == year(df$to_date)) return(df)
                  bind_rows(mutate(df, to_date = rollback(floor_date(to_date, "y"))),
                  mutate(df, from_date = floor_date(to_date, "y"))
                  )
                  }
                  )

                  #### OUTPUT ####

                  ID Disease Pass Payment from_date to_date
                  1 10 P US 110 2008-01-09 2008-12-31
                  2 10 P US 110 2009-01-01 2009-01-08
                  3 10 P US 110 2009-01-09 2009-12-31
                  4 10 P US 110 2010-01-01 2010-01-08
                  5 10 P US 115 2010-01-09 2010-12-31
                  6 10 P US 115 2011-01-01 2011-01-08
                  7 12 D EN 240 2008-01-01 2008-12-31
                  8 12 P EN 255 2013-12-31 2013-12-31
                  9 12 P EN 255 2014-01-01 2014-12-30


                  To explain: The dataframe is split into a list of rows. I then use map_dfr to run the function on each dataframe where from_date and to_date contain different years. map_dfr also binds the resulting dataframes together. Within the anonymous function I floor to_date by year, and then I either roll it back to the last day of the previous month for the new to_date in the first row, or leave it as it is for the new from_date in the second row.







                  share|improve this answer














                  share|improve this answer



                  share|improve this answer








                  edited May 19 at 13:37

























                  answered May 19 at 13:13









                  gershtgersht

                  2,1961 gold badge2 silver badges11 bronze badges




                  2,1961 gold badge2 silver badges11 bronze badges

























                      2














                      Using from_date and to_date we can create a date sequence using seq.Date then split this sequence by year, finally select min and max of each year. Then use apply, separate_rows and separate to get the final result.



                      cr_date <- function(d1, d2){
                      #browser()
                      sequence_date <- seq.Date(as.Date(d1), as.Date(d2), by='day')
                      lst_dates <- lapply(split(sequence_date, lubridate::year(sequence_date)),
                      function(x) paste0(min(x), '|', max(x)))
                      result <- paste0(lst_dates, collapse = ';')
                      return(result)
                      }

                      #Test
                      #cr_date(as.Date('2008-01-09'),as.Date('2009-01-08'))
                      test$flag <- apply(test, 1, function(x) cr_date(x['from_date'], x['to_date']))

                      library(tidyr)
                      separate_rows(test, flag, sep=';') %>%
                      separate(flag, into = c('from_date_new','to_date_new'), '\|') %>%
                      mutate_at(vars('from_date_new','to_date_new'), list(~as.Date(.)))


                      ID Disease Pass Payment from_date to_date from_date_new to_date_new
                      1 10 P US 110 2008-01-09 2009-01-08 2008-01-09 2008-12-31
                      2 10 P US 110 2008-01-09 2009-01-08 2009-01-01 2009-01-08
                      3 10 P US 110 2009-01-09 2010-01-08 2009-01-09 2009-12-31
                      4 10 P US 110 2009-01-09 2010-01-08 2010-01-01 2010-01-08
                      5 10 P US 115 2010-01-09 2011-01-08 2010-01-09 2010-12-31
                      6 10 P US 115 2010-01-09 2011-01-08 2011-01-01 2011-01-08
                      7 12 D EN 240 2008-01-01 2008-12-31 2008-01-01 2008-12-31
                      8 12 P EN 255 2013-12-31 2014-12-30 2013-12-31 2013-12-31
                      9 12 P EN 255 2013-12-31 2014-12-30 2014-01-01 2014-12-30





                      share|improve this answer


























                      • This gives me an error, and to_dates becomes NA: Warning message: Expected 2 pieces. Missing pieces filled with NA in 3547 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

                        – Helen
                        May 19 at 12:20











                      • Yes, it's produced by separate. My dataset is quite big, is is something in particular you are looking for?

                        – Helen
                        May 19 at 12:28











                      • So, one last question, this does not mantain the date format for the new date columns, correct?

                        – Helen
                        May 19 at 12:54






                      • 1





                        @Erosennin yes that is correct. See my update which handles this issue.

                        – A. Suliman
                        May 19 at 13:35
















                      2














                      Using from_date and to_date we can create a date sequence using seq.Date then split this sequence by year, finally select min and max of each year. Then use apply, separate_rows and separate to get the final result.



                      cr_date <- function(d1, d2){
                      #browser()
                      sequence_date <- seq.Date(as.Date(d1), as.Date(d2), by='day')
                      lst_dates <- lapply(split(sequence_date, lubridate::year(sequence_date)),
                      function(x) paste0(min(x), '|', max(x)))
                      result <- paste0(lst_dates, collapse = ';')
                      return(result)
                      }

                      #Test
                      #cr_date(as.Date('2008-01-09'),as.Date('2009-01-08'))
                      test$flag <- apply(test, 1, function(x) cr_date(x['from_date'], x['to_date']))

                      library(tidyr)
                      separate_rows(test, flag, sep=';') %>%
                      separate(flag, into = c('from_date_new','to_date_new'), '\|') %>%
                      mutate_at(vars('from_date_new','to_date_new'), list(~as.Date(.)))


                      ID Disease Pass Payment from_date to_date from_date_new to_date_new
                      1 10 P US 110 2008-01-09 2009-01-08 2008-01-09 2008-12-31
                      2 10 P US 110 2008-01-09 2009-01-08 2009-01-01 2009-01-08
                      3 10 P US 110 2009-01-09 2010-01-08 2009-01-09 2009-12-31
                      4 10 P US 110 2009-01-09 2010-01-08 2010-01-01 2010-01-08
                      5 10 P US 115 2010-01-09 2011-01-08 2010-01-09 2010-12-31
                      6 10 P US 115 2010-01-09 2011-01-08 2011-01-01 2011-01-08
                      7 12 D EN 240 2008-01-01 2008-12-31 2008-01-01 2008-12-31
                      8 12 P EN 255 2013-12-31 2014-12-30 2013-12-31 2013-12-31
                      9 12 P EN 255 2013-12-31 2014-12-30 2014-01-01 2014-12-30





                      share|improve this answer


























                      • This gives me an error, and to_dates becomes NA: Warning message: Expected 2 pieces. Missing pieces filled with NA in 3547 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

                        – Helen
                        May 19 at 12:20











                      • Yes, it's produced by separate. My dataset is quite big, is is something in particular you are looking for?

                        – Helen
                        May 19 at 12:28











                      • So, one last question, this does not mantain the date format for the new date columns, correct?

                        – Helen
                        May 19 at 12:54






                      • 1





                        @Erosennin yes that is correct. See my update which handles this issue.

                        – A. Suliman
                        May 19 at 13:35














                      2












                      2








                      2







                      Using from_date and to_date we can create a date sequence using seq.Date then split this sequence by year, finally select min and max of each year. Then use apply, separate_rows and separate to get the final result.



                      cr_date <- function(d1, d2){
                      #browser()
                      sequence_date <- seq.Date(as.Date(d1), as.Date(d2), by='day')
                      lst_dates <- lapply(split(sequence_date, lubridate::year(sequence_date)),
                      function(x) paste0(min(x), '|', max(x)))
                      result <- paste0(lst_dates, collapse = ';')
                      return(result)
                      }

                      #Test
                      #cr_date(as.Date('2008-01-09'),as.Date('2009-01-08'))
                      test$flag <- apply(test, 1, function(x) cr_date(x['from_date'], x['to_date']))

                      library(tidyr)
                      separate_rows(test, flag, sep=';') %>%
                      separate(flag, into = c('from_date_new','to_date_new'), '\|') %>%
                      mutate_at(vars('from_date_new','to_date_new'), list(~as.Date(.)))


                      ID Disease Pass Payment from_date to_date from_date_new to_date_new
                      1 10 P US 110 2008-01-09 2009-01-08 2008-01-09 2008-12-31
                      2 10 P US 110 2008-01-09 2009-01-08 2009-01-01 2009-01-08
                      3 10 P US 110 2009-01-09 2010-01-08 2009-01-09 2009-12-31
                      4 10 P US 110 2009-01-09 2010-01-08 2010-01-01 2010-01-08
                      5 10 P US 115 2010-01-09 2011-01-08 2010-01-09 2010-12-31
                      6 10 P US 115 2010-01-09 2011-01-08 2011-01-01 2011-01-08
                      7 12 D EN 240 2008-01-01 2008-12-31 2008-01-01 2008-12-31
                      8 12 P EN 255 2013-12-31 2014-12-30 2013-12-31 2013-12-31
                      9 12 P EN 255 2013-12-31 2014-12-30 2014-01-01 2014-12-30





                      share|improve this answer















                      Using from_date and to_date we can create a date sequence using seq.Date then split this sequence by year, finally select min and max of each year. Then use apply, separate_rows and separate to get the final result.



                      cr_date <- function(d1, d2){
                      #browser()
                      sequence_date <- seq.Date(as.Date(d1), as.Date(d2), by='day')
                      lst_dates <- lapply(split(sequence_date, lubridate::year(sequence_date)),
                      function(x) paste0(min(x), '|', max(x)))
                      result <- paste0(lst_dates, collapse = ';')
                      return(result)
                      }

                      #Test
                      #cr_date(as.Date('2008-01-09'),as.Date('2009-01-08'))
                      test$flag <- apply(test, 1, function(x) cr_date(x['from_date'], x['to_date']))

                      library(tidyr)
                      separate_rows(test, flag, sep=';') %>%
                      separate(flag, into = c('from_date_new','to_date_new'), '\|') %>%
                      mutate_at(vars('from_date_new','to_date_new'), list(~as.Date(.)))


                      ID Disease Pass Payment from_date to_date from_date_new to_date_new
                      1 10 P US 110 2008-01-09 2009-01-08 2008-01-09 2008-12-31
                      2 10 P US 110 2008-01-09 2009-01-08 2009-01-01 2009-01-08
                      3 10 P US 110 2009-01-09 2010-01-08 2009-01-09 2009-12-31
                      4 10 P US 110 2009-01-09 2010-01-08 2010-01-01 2010-01-08
                      5 10 P US 115 2010-01-09 2011-01-08 2010-01-09 2010-12-31
                      6 10 P US 115 2010-01-09 2011-01-08 2011-01-01 2011-01-08
                      7 12 D EN 240 2008-01-01 2008-12-31 2008-01-01 2008-12-31
                      8 12 P EN 255 2013-12-31 2014-12-30 2013-12-31 2013-12-31
                      9 12 P EN 255 2013-12-31 2014-12-30 2014-01-01 2014-12-30






                      share|improve this answer














                      share|improve this answer



                      share|improve this answer








                      edited May 19 at 13:33

























                      answered May 19 at 10:49









                      A. SulimanA. Suliman

                      7,0634 gold badges13 silver badges26 bronze badges




                      7,0634 gold badges13 silver badges26 bronze badges













                      • This gives me an error, and to_dates becomes NA: Warning message: Expected 2 pieces. Missing pieces filled with NA in 3547 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

                        – Helen
                        May 19 at 12:20











                      • Yes, it's produced by separate. My dataset is quite big, is is something in particular you are looking for?

                        – Helen
                        May 19 at 12:28











                      • So, one last question, this does not mantain the date format for the new date columns, correct?

                        – Helen
                        May 19 at 12:54






                      • 1





                        @Erosennin yes that is correct. See my update which handles this issue.

                        – A. Suliman
                        May 19 at 13:35



















                      • This gives me an error, and to_dates becomes NA: Warning message: Expected 2 pieces. Missing pieces filled with NA in 3547 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

                        – Helen
                        May 19 at 12:20











                      • Yes, it's produced by separate. My dataset is quite big, is is something in particular you are looking for?

                        – Helen
                        May 19 at 12:28











                      • So, one last question, this does not mantain the date format for the new date columns, correct?

                        – Helen
                        May 19 at 12:54






                      • 1





                        @Erosennin yes that is correct. See my update which handles this issue.

                        – A. Suliman
                        May 19 at 13:35

















                      This gives me an error, and to_dates becomes NA: Warning message: Expected 2 pieces. Missing pieces filled with NA in 3547 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

                      – Helen
                      May 19 at 12:20





                      This gives me an error, and to_dates becomes NA: Warning message: Expected 2 pieces. Missing pieces filled with NA in 3547 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

                      – Helen
                      May 19 at 12:20













                      Yes, it's produced by separate. My dataset is quite big, is is something in particular you are looking for?

                      – Helen
                      May 19 at 12:28





                      Yes, it's produced by separate. My dataset is quite big, is is something in particular you are looking for?

                      – Helen
                      May 19 at 12:28













                      So, one last question, this does not mantain the date format for the new date columns, correct?

                      – Helen
                      May 19 at 12:54





                      So, one last question, this does not mantain the date format for the new date columns, correct?

                      – Helen
                      May 19 at 12:54




                      1




                      1





                      @Erosennin yes that is correct. See my update which handles this issue.

                      – A. Suliman
                      May 19 at 13:35





                      @Erosennin yes that is correct. See my update which handles this issue.

                      – A. Suliman
                      May 19 at 13:35











                      2














                      This uses only base R.



                      First note that only dates with no times are used so we should be using Date class, not POSIXct. The latter can needlessly introduce timezone errors unless you are very careful so in the Note at the end which shows the input used we assume that we are starting out with test2 which contains Date class data. The code in the Note also shows how to convert it to Date class if it it already POSIXct.



                      Given test2 we add from_year, to_year and eoy (date at the end of the year) columns giving test3. Then we iterate over the rows and if the years are the same return the row and if not return the split rows. This gives a list of one and two row data frames which we rbind together.



                      test3 <- transform(test2, 
                      from_year = format(from_date, "%Y"),
                      to_year = format(to_date, "%Y"),
                      eoy = as.Date(sub("-.*", "-12-31", from_date)))

                      nr <- nrow(test2)
                      do.call("rbind", lapply(1:nr, function(i) with(test3[i, ],
                      if (from_year == to_year) test2[i, ]
                      else data.frame(ID, Disease, Pass, Payment,
                      from_date = c(from_date, eoy+1),
                      to_date = c(eoy, to_date)))
                      ))


                      Note



                      Assumed input in reproducible form. As noted above it uses Date class.



                      test2 <- transform(test, 
                      from_date = as.Date(from_date),
                      to_date = as.Date(to_date))





                      share|improve this answer




























                        2














                        This uses only base R.



                        First note that only dates with no times are used so we should be using Date class, not POSIXct. The latter can needlessly introduce timezone errors unless you are very careful so in the Note at the end which shows the input used we assume that we are starting out with test2 which contains Date class data. The code in the Note also shows how to convert it to Date class if it it already POSIXct.



                        Given test2 we add from_year, to_year and eoy (date at the end of the year) columns giving test3. Then we iterate over the rows and if the years are the same return the row and if not return the split rows. This gives a list of one and two row data frames which we rbind together.



                        test3 <- transform(test2, 
                        from_year = format(from_date, "%Y"),
                        to_year = format(to_date, "%Y"),
                        eoy = as.Date(sub("-.*", "-12-31", from_date)))

                        nr <- nrow(test2)
                        do.call("rbind", lapply(1:nr, function(i) with(test3[i, ],
                        if (from_year == to_year) test2[i, ]
                        else data.frame(ID, Disease, Pass, Payment,
                        from_date = c(from_date, eoy+1),
                        to_date = c(eoy, to_date)))
                        ))


                        Note



                        Assumed input in reproducible form. As noted above it uses Date class.



                        test2 <- transform(test, 
                        from_date = as.Date(from_date),
                        to_date = as.Date(to_date))





                        share|improve this answer


























                          2












                          2








                          2







                          This uses only base R.



                          First note that only dates with no times are used so we should be using Date class, not POSIXct. The latter can needlessly introduce timezone errors unless you are very careful so in the Note at the end which shows the input used we assume that we are starting out with test2 which contains Date class data. The code in the Note also shows how to convert it to Date class if it it already POSIXct.



                          Given test2 we add from_year, to_year and eoy (date at the end of the year) columns giving test3. Then we iterate over the rows and if the years are the same return the row and if not return the split rows. This gives a list of one and two row data frames which we rbind together.



                          test3 <- transform(test2, 
                          from_year = format(from_date, "%Y"),
                          to_year = format(to_date, "%Y"),
                          eoy = as.Date(sub("-.*", "-12-31", from_date)))

                          nr <- nrow(test2)
                          do.call("rbind", lapply(1:nr, function(i) with(test3[i, ],
                          if (from_year == to_year) test2[i, ]
                          else data.frame(ID, Disease, Pass, Payment,
                          from_date = c(from_date, eoy+1),
                          to_date = c(eoy, to_date)))
                          ))


                          Note



                          Assumed input in reproducible form. As noted above it uses Date class.



                          test2 <- transform(test, 
                          from_date = as.Date(from_date),
                          to_date = as.Date(to_date))





                          share|improve this answer













                          This uses only base R.



                          First note that only dates with no times are used so we should be using Date class, not POSIXct. The latter can needlessly introduce timezone errors unless you are very careful so in the Note at the end which shows the input used we assume that we are starting out with test2 which contains Date class data. The code in the Note also shows how to convert it to Date class if it it already POSIXct.



                          Given test2 we add from_year, to_year and eoy (date at the end of the year) columns giving test3. Then we iterate over the rows and if the years are the same return the row and if not return the split rows. This gives a list of one and two row data frames which we rbind together.



                          test3 <- transform(test2, 
                          from_year = format(from_date, "%Y"),
                          to_year = format(to_date, "%Y"),
                          eoy = as.Date(sub("-.*", "-12-31", from_date)))

                          nr <- nrow(test2)
                          do.call("rbind", lapply(1:nr, function(i) with(test3[i, ],
                          if (from_year == to_year) test2[i, ]
                          else data.frame(ID, Disease, Pass, Payment,
                          from_date = c(from_date, eoy+1),
                          to_date = c(eoy, to_date)))
                          ))


                          Note



                          Assumed input in reproducible form. As noted above it uses Date class.



                          test2 <- transform(test, 
                          from_date = as.Date(from_date),
                          to_date = as.Date(to_date))






                          share|improve this answer












                          share|improve this answer



                          share|improve this answer










                          answered May 19 at 17:09









                          G. GrothendieckG. Grothendieck

                          161k11 gold badges143 silver badges252 bronze badges




                          161k11 gold badges143 silver badges252 bronze badges























                              1














                              You could as well try something like below using dplyr and lubridate. It works as following: 1. Duplicate the dataframe using rbind. 2. Arrange at first on ID, secondly on from_date and third on the order of rows given in test. 3. in the even rows, change from_date to the first day of the new year. 4. In the odd rows, change to_date to the last day of the previous year. 5. Finally, exclude the rows in which the difference between from_date and to_date is only 1 second.



                              test %>% 
                              rbind(test) %>%
                              arrange(ID, from_date) %>%
                              mutate(from_date = if_else(row_number() %% 2 == 0, ceiling_date(from_date, "year") + 1, from_date),
                              to_date = if_else(row_number() %% 2 == 1, floor_date(to_date, "year") - 1, to_date)) %>%
                              filter(from_date - to_date != 1)

                              ID Disease Pass Payment from_date to_date
                              1 10 P US 110 2008-01-09 00:00:00 2008-12-31 23:59:59
                              2 10 P US 110 2009-01-01 00:00:01 2009-01-08 00:00:00
                              3 10 P US 110 2009-01-09 00:00:00 2009-12-31 23:59:59
                              4 10 P US 110 2010-01-01 00:00:01 2010-01-08 00:00:00
                              5 10 P US 115 2010-01-09 00:00:00 2010-12-31 23:59:59
                              6 10 P US 115 2011-01-01 00:00:01 2011-01-08 00:00:00
                              7 12 D EN 240 2008-01-01 00:00:01 2008-12-31 00:00:00
                              8 12 P EN 255 2013-12-31 00:00:00 2013-12-31 23:59:59
                              9 12 P EN 255 2014-01-01 00:00:01 2014-12-30 00:00:00


                              Only downside might be that times are added, but you could of course delete those. And in case a period might continue in a third year, you could use the same logic but with a second rbind and row_number() %% 3 == 0






                              share|improve this answer


























                              • I get an error: Error: Column from_date can't be modified because it's a grouping variable

                                – Helen
                                May 19 at 12:30











                              • When you load test from your OP, there is no grouping variable. Else run ungroup first

                                – Lennyy
                                May 19 at 12:56











                              • Oh, I think my lack of knowledge of dplyr is the issue here, sorry! How do I ungroup?

                                – Helen
                                May 19 at 13:16






                              • 1





                                @Erosennin just add ungroup() %>% below test %>%

                                – gersht
                                May 19 at 13:31
















                              1














                              You could as well try something like below using dplyr and lubridate. It works as following: 1. Duplicate the dataframe using rbind. 2. Arrange at first on ID, secondly on from_date and third on the order of rows given in test. 3. in the even rows, change from_date to the first day of the new year. 4. In the odd rows, change to_date to the last day of the previous year. 5. Finally, exclude the rows in which the difference between from_date and to_date is only 1 second.



                              test %>% 
                              rbind(test) %>%
                              arrange(ID, from_date) %>%
                              mutate(from_date = if_else(row_number() %% 2 == 0, ceiling_date(from_date, "year") + 1, from_date),
                              to_date = if_else(row_number() %% 2 == 1, floor_date(to_date, "year") - 1, to_date)) %>%
                              filter(from_date - to_date != 1)

                              ID Disease Pass Payment from_date to_date
                              1 10 P US 110 2008-01-09 00:00:00 2008-12-31 23:59:59
                              2 10 P US 110 2009-01-01 00:00:01 2009-01-08 00:00:00
                              3 10 P US 110 2009-01-09 00:00:00 2009-12-31 23:59:59
                              4 10 P US 110 2010-01-01 00:00:01 2010-01-08 00:00:00
                              5 10 P US 115 2010-01-09 00:00:00 2010-12-31 23:59:59
                              6 10 P US 115 2011-01-01 00:00:01 2011-01-08 00:00:00
                              7 12 D EN 240 2008-01-01 00:00:01 2008-12-31 00:00:00
                              8 12 P EN 255 2013-12-31 00:00:00 2013-12-31 23:59:59
                              9 12 P EN 255 2014-01-01 00:00:01 2014-12-30 00:00:00


                              Only downside might be that times are added, but you could of course delete those. And in case a period might continue in a third year, you could use the same logic but with a second rbind and row_number() %% 3 == 0






                              share|improve this answer


























                              • I get an error: Error: Column from_date can't be modified because it's a grouping variable

                                – Helen
                                May 19 at 12:30











                              • When you load test from your OP, there is no grouping variable. Else run ungroup first

                                – Lennyy
                                May 19 at 12:56











                              • Oh, I think my lack of knowledge of dplyr is the issue here, sorry! How do I ungroup?

                                – Helen
                                May 19 at 13:16






                              • 1





                                @Erosennin just add ungroup() %>% below test %>%

                                – gersht
                                May 19 at 13:31














                              1












                              1








                              1







                              You could as well try something like below using dplyr and lubridate. It works as following: 1. Duplicate the dataframe using rbind. 2. Arrange at first on ID, secondly on from_date and third on the order of rows given in test. 3. in the even rows, change from_date to the first day of the new year. 4. In the odd rows, change to_date to the last day of the previous year. 5. Finally, exclude the rows in which the difference between from_date and to_date is only 1 second.



                              test %>% 
                              rbind(test) %>%
                              arrange(ID, from_date) %>%
                              mutate(from_date = if_else(row_number() %% 2 == 0, ceiling_date(from_date, "year") + 1, from_date),
                              to_date = if_else(row_number() %% 2 == 1, floor_date(to_date, "year") - 1, to_date)) %>%
                              filter(from_date - to_date != 1)

                              ID Disease Pass Payment from_date to_date
                              1 10 P US 110 2008-01-09 00:00:00 2008-12-31 23:59:59
                              2 10 P US 110 2009-01-01 00:00:01 2009-01-08 00:00:00
                              3 10 P US 110 2009-01-09 00:00:00 2009-12-31 23:59:59
                              4 10 P US 110 2010-01-01 00:00:01 2010-01-08 00:00:00
                              5 10 P US 115 2010-01-09 00:00:00 2010-12-31 23:59:59
                              6 10 P US 115 2011-01-01 00:00:01 2011-01-08 00:00:00
                              7 12 D EN 240 2008-01-01 00:00:01 2008-12-31 00:00:00
                              8 12 P EN 255 2013-12-31 00:00:00 2013-12-31 23:59:59
                              9 12 P EN 255 2014-01-01 00:00:01 2014-12-30 00:00:00


                              Only downside might be that times are added, but you could of course delete those. And in case a period might continue in a third year, you could use the same logic but with a second rbind and row_number() %% 3 == 0






                              share|improve this answer















                              You could as well try something like below using dplyr and lubridate. It works as following: 1. Duplicate the dataframe using rbind. 2. Arrange at first on ID, secondly on from_date and third on the order of rows given in test. 3. in the even rows, change from_date to the first day of the new year. 4. In the odd rows, change to_date to the last day of the previous year. 5. Finally, exclude the rows in which the difference between from_date and to_date is only 1 second.



                              test %>% 
                              rbind(test) %>%
                              arrange(ID, from_date) %>%
                              mutate(from_date = if_else(row_number() %% 2 == 0, ceiling_date(from_date, "year") + 1, from_date),
                              to_date = if_else(row_number() %% 2 == 1, floor_date(to_date, "year") - 1, to_date)) %>%
                              filter(from_date - to_date != 1)

                              ID Disease Pass Payment from_date to_date
                              1 10 P US 110 2008-01-09 00:00:00 2008-12-31 23:59:59
                              2 10 P US 110 2009-01-01 00:00:01 2009-01-08 00:00:00
                              3 10 P US 110 2009-01-09 00:00:00 2009-12-31 23:59:59
                              4 10 P US 110 2010-01-01 00:00:01 2010-01-08 00:00:00
                              5 10 P US 115 2010-01-09 00:00:00 2010-12-31 23:59:59
                              6 10 P US 115 2011-01-01 00:00:01 2011-01-08 00:00:00
                              7 12 D EN 240 2008-01-01 00:00:01 2008-12-31 00:00:00
                              8 12 P EN 255 2013-12-31 00:00:00 2013-12-31 23:59:59
                              9 12 P EN 255 2014-01-01 00:00:01 2014-12-30 00:00:00


                              Only downside might be that times are added, but you could of course delete those. And in case a period might continue in a third year, you could use the same logic but with a second rbind and row_number() %% 3 == 0







                              share|improve this answer














                              share|improve this answer



                              share|improve this answer








                              edited May 19 at 12:11

























                              answered May 19 at 11:58









                              LennyyLennyy

                              3,8452 gold badges4 silver badges17 bronze badges




                              3,8452 gold badges4 silver badges17 bronze badges













                              • I get an error: Error: Column from_date can't be modified because it's a grouping variable

                                – Helen
                                May 19 at 12:30











                              • When you load test from your OP, there is no grouping variable. Else run ungroup first

                                – Lennyy
                                May 19 at 12:56











                              • Oh, I think my lack of knowledge of dplyr is the issue here, sorry! How do I ungroup?

                                – Helen
                                May 19 at 13:16






                              • 1





                                @Erosennin just add ungroup() %>% below test %>%

                                – gersht
                                May 19 at 13:31



















                              • I get an error: Error: Column from_date can't be modified because it's a grouping variable

                                – Helen
                                May 19 at 12:30











                              • When you load test from your OP, there is no grouping variable. Else run ungroup first

                                – Lennyy
                                May 19 at 12:56











                              • Oh, I think my lack of knowledge of dplyr is the issue here, sorry! How do I ungroup?

                                – Helen
                                May 19 at 13:16






                              • 1





                                @Erosennin just add ungroup() %>% below test %>%

                                – gersht
                                May 19 at 13:31

















                              I get an error: Error: Column from_date can't be modified because it's a grouping variable

                              – Helen
                              May 19 at 12:30





                              I get an error: Error: Column from_date can't be modified because it's a grouping variable

                              – Helen
                              May 19 at 12:30













                              When you load test from your OP, there is no grouping variable. Else run ungroup first

                              – Lennyy
                              May 19 at 12:56





                              When you load test from your OP, there is no grouping variable. Else run ungroup first

                              – Lennyy
                              May 19 at 12:56













                              Oh, I think my lack of knowledge of dplyr is the issue here, sorry! How do I ungroup?

                              – Helen
                              May 19 at 13:16





                              Oh, I think my lack of knowledge of dplyr is the issue here, sorry! How do I ungroup?

                              – Helen
                              May 19 at 13:16




                              1




                              1





                              @Erosennin just add ungroup() %>% below test %>%

                              – gersht
                              May 19 at 13:31





                              @Erosennin just add ungroup() %>% below test %>%

                              – gersht
                              May 19 at 13:31











                              1














                              I am just using a data.table which also provides a yearfunction
                              and ignore the possibly slow date conversion logic with as.POSIXct.



                              I am also assuming that the to_date and from_date may differ by one year only (not more than one year!).



                              library(data.table)  # also provides a "year" function

                              setDT(test)

                              # Create additional rows for the new year
                              additional_rows <- test[year(from_date) < year(to_date), ]
                              additional_rows[, from_date := as.POSIXct(paste0(year(to_date),"-01-01"))]

                              # Shorten the "from_date" of the affected original rows
                              test[year(from_date) < year(to_date), to_date := as.POSIXct(paste0(year(from_date),"-12-31"))]

                              # Create a combined data table as result
                              result <- rbind(test, additional_rows)
                              setkey(result, ID, Payment, from_date) # just to sort the data like the "test_desired" sort order


                              which results in



                              > result
                              ID Disease Pass Payment from_date to_date
                              1: 10 P US 110 2008-01-09 2008-12-31
                              2: 10 P US 110 2009-01-01 2009-01-08
                              3: 10 P US 110 2009-01-09 2009-12-31
                              4: 10 P US 110 2010-01-01 2010-01-08
                              5: 10 P US 115 2010-01-09 2010-12-31
                              6: 10 P US 115 2011-01-01 2011-01-08
                              7: 12 D EN 240 2008-01-01 2008-12-31
                              8: 12 P EN 255 2013-12-31 2013-12-31
                              9: 12 P EN 255 2014-01-01 2014-12-30





                              share|improve this answer


























                              • I'm having difficulties testing this solution as I'm using dplyr and lubridate and loading data.table masks some functions that I am already using.

                                – Helen
                                May 19 at 12:25











                              • What kind of difficulties (symptoms)? An "easy" solution is to modify the order of library statements (the package loaded first wins until you specify a function name with the package name, eg. data.table::year. So: Try to put library(data.table) at the end of all other library statements and it should work...

                                – R Yoda
                                May 19 at 14:43
















                              1














                              I am just using a data.table which also provides a yearfunction
                              and ignore the possibly slow date conversion logic with as.POSIXct.



                              I am also assuming that the to_date and from_date may differ by one year only (not more than one year!).



                              library(data.table)  # also provides a "year" function

                              setDT(test)

                              # Create additional rows for the new year
                              additional_rows <- test[year(from_date) < year(to_date), ]
                              additional_rows[, from_date := as.POSIXct(paste0(year(to_date),"-01-01"))]

                              # Shorten the "from_date" of the affected original rows
                              test[year(from_date) < year(to_date), to_date := as.POSIXct(paste0(year(from_date),"-12-31"))]

                              # Create a combined data table as result
                              result <- rbind(test, additional_rows)
                              setkey(result, ID, Payment, from_date) # just to sort the data like the "test_desired" sort order


                              which results in



                              > result
                              ID Disease Pass Payment from_date to_date
                              1: 10 P US 110 2008-01-09 2008-12-31
                              2: 10 P US 110 2009-01-01 2009-01-08
                              3: 10 P US 110 2009-01-09 2009-12-31
                              4: 10 P US 110 2010-01-01 2010-01-08
                              5: 10 P US 115 2010-01-09 2010-12-31
                              6: 10 P US 115 2011-01-01 2011-01-08
                              7: 12 D EN 240 2008-01-01 2008-12-31
                              8: 12 P EN 255 2013-12-31 2013-12-31
                              9: 12 P EN 255 2014-01-01 2014-12-30





                              share|improve this answer


























                              • I'm having difficulties testing this solution as I'm using dplyr and lubridate and loading data.table masks some functions that I am already using.

                                – Helen
                                May 19 at 12:25











                              • What kind of difficulties (symptoms)? An "easy" solution is to modify the order of library statements (the package loaded first wins until you specify a function name with the package name, eg. data.table::year. So: Try to put library(data.table) at the end of all other library statements and it should work...

                                – R Yoda
                                May 19 at 14:43














                              1












                              1








                              1







                              I am just using a data.table which also provides a yearfunction
                              and ignore the possibly slow date conversion logic with as.POSIXct.



                              I am also assuming that the to_date and from_date may differ by one year only (not more than one year!).



                              library(data.table)  # also provides a "year" function

                              setDT(test)

                              # Create additional rows for the new year
                              additional_rows <- test[year(from_date) < year(to_date), ]
                              additional_rows[, from_date := as.POSIXct(paste0(year(to_date),"-01-01"))]

                              # Shorten the "from_date" of the affected original rows
                              test[year(from_date) < year(to_date), to_date := as.POSIXct(paste0(year(from_date),"-12-31"))]

                              # Create a combined data table as result
                              result <- rbind(test, additional_rows)
                              setkey(result, ID, Payment, from_date) # just to sort the data like the "test_desired" sort order


                              which results in



                              > result
                              ID Disease Pass Payment from_date to_date
                              1: 10 P US 110 2008-01-09 2008-12-31
                              2: 10 P US 110 2009-01-01 2009-01-08
                              3: 10 P US 110 2009-01-09 2009-12-31
                              4: 10 P US 110 2010-01-01 2010-01-08
                              5: 10 P US 115 2010-01-09 2010-12-31
                              6: 10 P US 115 2011-01-01 2011-01-08
                              7: 12 D EN 240 2008-01-01 2008-12-31
                              8: 12 P EN 255 2013-12-31 2013-12-31
                              9: 12 P EN 255 2014-01-01 2014-12-30





                              share|improve this answer















                              I am just using a data.table which also provides a yearfunction
                              and ignore the possibly slow date conversion logic with as.POSIXct.



                              I am also assuming that the to_date and from_date may differ by one year only (not more than one year!).



                              library(data.table)  # also provides a "year" function

                              setDT(test)

                              # Create additional rows for the new year
                              additional_rows <- test[year(from_date) < year(to_date), ]
                              additional_rows[, from_date := as.POSIXct(paste0(year(to_date),"-01-01"))]

                              # Shorten the "from_date" of the affected original rows
                              test[year(from_date) < year(to_date), to_date := as.POSIXct(paste0(year(from_date),"-12-31"))]

                              # Create a combined data table as result
                              result <- rbind(test, additional_rows)
                              setkey(result, ID, Payment, from_date) # just to sort the data like the "test_desired" sort order


                              which results in



                              > result
                              ID Disease Pass Payment from_date to_date
                              1: 10 P US 110 2008-01-09 2008-12-31
                              2: 10 P US 110 2009-01-01 2009-01-08
                              3: 10 P US 110 2009-01-09 2009-12-31
                              4: 10 P US 110 2010-01-01 2010-01-08
                              5: 10 P US 115 2010-01-09 2010-12-31
                              6: 10 P US 115 2011-01-01 2011-01-08
                              7: 12 D EN 240 2008-01-01 2008-12-31
                              8: 12 P EN 255 2013-12-31 2013-12-31
                              9: 12 P EN 255 2014-01-01 2014-12-30






                              share|improve this answer














                              share|improve this answer



                              share|improve this answer








                              edited May 20 at 6:14

























                              answered May 19 at 10:34









                              R YodaR Yoda

                              4,48820 silver badges49 bronze badges




                              4,48820 silver badges49 bronze badges













                              • I'm having difficulties testing this solution as I'm using dplyr and lubridate and loading data.table masks some functions that I am already using.

                                – Helen
                                May 19 at 12:25











                              • What kind of difficulties (symptoms)? An "easy" solution is to modify the order of library statements (the package loaded first wins until you specify a function name with the package name, eg. data.table::year. So: Try to put library(data.table) at the end of all other library statements and it should work...

                                – R Yoda
                                May 19 at 14:43



















                              • I'm having difficulties testing this solution as I'm using dplyr and lubridate and loading data.table masks some functions that I am already using.

                                – Helen
                                May 19 at 12:25











                              • What kind of difficulties (symptoms)? An "easy" solution is to modify the order of library statements (the package loaded first wins until you specify a function name with the package name, eg. data.table::year. So: Try to put library(data.table) at the end of all other library statements and it should work...

                                – R Yoda
                                May 19 at 14:43

















                              I'm having difficulties testing this solution as I'm using dplyr and lubridate and loading data.table masks some functions that I am already using.

                              – Helen
                              May 19 at 12:25





                              I'm having difficulties testing this solution as I'm using dplyr and lubridate and loading data.table masks some functions that I am already using.

                              – Helen
                              May 19 at 12:25













                              What kind of difficulties (symptoms)? An "easy" solution is to modify the order of library statements (the package loaded first wins until you specify a function name with the package name, eg. data.table::year. So: Try to put library(data.table) at the end of all other library statements and it should work...

                              – R Yoda
                              May 19 at 14:43





                              What kind of difficulties (symptoms)? An "easy" solution is to modify the order of library statements (the package loaded first wins until you specify a function name with the package name, eg. data.table::year. So: Try to put library(data.table) at the end of all other library statements and it should work...

                              – R Yoda
                              May 19 at 14:43


















                              draft saved

                              draft discarded




















































                              Thanks for contributing an answer to Stack Overflow!


                              • Please be sure to answer the question. Provide details and share your research!

                              But avoid



                              • Asking for help, clarification, or responding to other answers.

                              • Making statements based on opinion; back them up with references or personal experience.


                              To learn more, see our tips on writing great answers.




                              draft saved


                              draft discarded














                              StackExchange.ready(
                              function () {
                              StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f56206794%2fsplit-date-rows-by-new-year%23new-answer', 'question_page');
                              }
                              );

                              Post as a guest















                              Required, but never shown





















































                              Required, but never shown














                              Required, but never shown












                              Required, but never shown







                              Required, but never shown

































                              Required, but never shown














                              Required, but never shown












                              Required, but never shown







                              Required, but never shown







                              Popular posts from this blog

                              Bruad Bilen | Luke uk diar | NawigatsjuunCommonskategorii: BruadCommonskategorii: RunstükenWikiquote: Bruad

                              Færeyskur hestur Heimild | Tengill | Tilvísanir | LeiðsagnarvalRossið - síða um færeyska hrossið á færeyskuGott ár hjá færeyska hestinum

                              He _____ here since 1970 . Answer needed [closed]What does “since he was so high” mean?Meaning of “catch birds for”?How do I ensure “since” takes the meaning I want?“Who cares here” meaningWhat does “right round toward” mean?the time tense (had now been detected)What does the phrase “ring around the roses” mean here?Correct usage of “visited upon”Meaning of “foiled rail sabotage bid”It was the third time I had gone to Rome or It is the third time I had been to Rome