From feeee046010ac53e1f6baf6aa1a696a26c7c74d7 Mon Sep 17 00:00:00 2001 From: Martin Hilton Date: Wed, 31 Jan 2024 11:01:26 +0000 Subject: [PATCH] fix: unambiguously truncate time in date_trunc function (#9068) * fix: unambiguously truncate time in date_trunc function When date_trunc is truncating a timestamp with a geographic timezone it would previously get stuck if the local reprentation of the time could be ambiguously interpretted. This happens when the clocks "go back". The update here is to use the original timestamp offset as the tie-breaker when the local representation of the truncated time could be ambiguous. * fix: fmt * chore: review suggestion * fix: fmt * Add date_trunc test to slt * Update test for fixed fode * feat: additional tests Add test for the historical America/Sao_Paulo timezone which changed in and out of DST at midnight. * chore: review suggestion * fix: date_trunc support days starting at 1am Historically Sao Paulo, and possibly other places, have had daylight savings time that started at midnight. This causes the day to start at 1am. The naive method used by date_trunc to truncate to 'day' will create a non-existent time in these circumstances. Adjust the timestamps produced by date_trunc in this case to be valid within the required timezone. * fix: fmt --------- Co-authored-by: Andrew Lamb --- .../physical-expr/src/datetime_expressions.rs | 236 +++++++++++++++++- .../sqllogictest/test_files/timestamps.slt | 167 +++++++++++++ 2 files changed, 400 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index 430220faf815..c40a89b0ba91 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -17,6 +17,7 @@ //! DateTime expressions +use std::ops::{Add, Sub}; use std::str::FromStr; use std::sync::Arc; @@ -43,7 +44,7 @@ use arrow_array::types::{ArrowTimestampType, Date32Type, Int32Type}; use arrow_array::GenericStringArray; use chrono::prelude::*; use chrono::LocalResult::Single; -use chrono::{Duration, Months, NaiveDate}; +use chrono::{Duration, LocalResult, Months, NaiveDate}; use itertools::Either; use datafusion_common::cast::{ @@ -662,8 +663,42 @@ fn _date_trunc_coarse_with_tz( granularity: &str, value: Option>, ) -> Result> { - let value = _date_trunc_coarse::>(granularity, value)?; - Ok(value.and_then(|value| value.timestamp_nanos_opt())) + if let Some(value) = value { + let local = value.naive_local(); + let truncated = _date_trunc_coarse::(granularity, Some(local))?; + let truncated = truncated.and_then(|truncated| { + match truncated.and_local_timezone(value.timezone()) { + LocalResult::None => { + // This can happen if the date_trunc operation moves the time into + // an hour that doesn't exist due to daylight savings. On known example where + // this can happen is with historic dates in the America/Sao_Paulo time zone. + // To account for this adjust the time by a few hours, convert to local time, + // and then adjust the time back. + truncated + .sub(Duration::hours(3)) + .and_local_timezone(value.timezone()) + .single() + .map(|v| v.add(Duration::hours(3))) + } + LocalResult::Single(datetime) => Some(datetime), + LocalResult::Ambiguous(datetime1, datetime2) => { + // Because we are truncating from an equally or more specific time + // the original time must have been within the ambiguous local time + // period. Therefore the offset of one of these times should match the + // offset of the original time. + if datetime1.offset().fix() == value.offset().fix() { + Some(datetime1) + } else { + Some(datetime2) + } + } + } + }); + Ok(truncated.and_then(|value| value.timestamp_nanos_opt())) + } else { + _date_trunc_coarse::(granularity, None)?; + Ok(None) + } } fn _date_trunc_coarse_without_tz( @@ -1784,6 +1819,44 @@ mod tests { "2020-09-08T00:00:00+08", ], ), + ( + vec![ + "2024-10-26T23:00:00Z", + "2024-10-27T00:00:00Z", + "2024-10-27T01:00:00Z", + "2024-10-27T02:00:00Z", + ], + Some("Europe/Berlin".into()), + vec![ + "2024-10-27T00:00:00+02", + "2024-10-27T00:00:00+02", + "2024-10-27T00:00:00+02", + "2024-10-27T00:00:00+02", + ], + ), + ( + vec![ + "2018-02-18T00:00:00Z", + "2018-02-18T01:00:00Z", + "2018-02-18T02:00:00Z", + "2018-02-18T03:00:00Z", + "2018-11-04T01:00:00Z", + "2018-11-04T02:00:00Z", + "2018-11-04T03:00:00Z", + "2018-11-04T04:00:00Z", + ], + Some("America/Sao_Paulo".into()), + vec![ + "2018-02-17T00:00:00-02", + "2018-02-17T00:00:00-02", + "2018-02-17T00:00:00-02", + "2018-02-18T00:00:00-03", + "2018-11-03T00:00:00-03", + "2018-11-03T00:00:00-03", + "2018-11-04T01:00:00-02", + "2018-11-04T01:00:00-02", + ], + ), ]; cases.iter().for_each(|(original, tz_opt, expected)| { @@ -1815,6 +1888,163 @@ mod tests { }); } + #[test] + fn test_date_trunc_hour_timezones() { + let cases = vec![ + ( + vec![ + "2020-09-08T00:30:00Z", + "2020-09-08T01:30:00Z", + "2020-09-08T02:30:00Z", + "2020-09-08T03:30:00Z", + "2020-09-08T04:30:00Z", + ], + Some("+00".into()), + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T01:00:00Z", + "2020-09-08T02:00:00Z", + "2020-09-08T03:00:00Z", + "2020-09-08T04:00:00Z", + ], + ), + ( + vec![ + "2020-09-08T00:30:00Z", + "2020-09-08T01:30:00Z", + "2020-09-08T02:30:00Z", + "2020-09-08T03:30:00Z", + "2020-09-08T04:30:00Z", + ], + None, + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T01:00:00Z", + "2020-09-08T02:00:00Z", + "2020-09-08T03:00:00Z", + "2020-09-08T04:00:00Z", + ], + ), + ( + vec![ + "2020-09-08T00:30:00Z", + "2020-09-08T01:30:00Z", + "2020-09-08T02:30:00Z", + "2020-09-08T03:30:00Z", + "2020-09-08T04:30:00Z", + ], + Some("-02".into()), + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T01:00:00Z", + "2020-09-08T02:00:00Z", + "2020-09-08T03:00:00Z", + "2020-09-08T04:00:00Z", + ], + ), + ( + vec![ + "2020-09-08T00:30:00+05", + "2020-09-08T01:30:00+05", + "2020-09-08T02:30:00+05", + "2020-09-08T03:30:00+05", + "2020-09-08T04:30:00+05", + ], + Some("+05".into()), + vec![ + "2020-09-08T00:00:00+05", + "2020-09-08T01:00:00+05", + "2020-09-08T02:00:00+05", + "2020-09-08T03:00:00+05", + "2020-09-08T04:00:00+05", + ], + ), + ( + vec![ + "2020-09-08T00:30:00+08", + "2020-09-08T01:30:00+08", + "2020-09-08T02:30:00+08", + "2020-09-08T03:30:00+08", + "2020-09-08T04:30:00+08", + ], + Some("+08".into()), + vec![ + "2020-09-08T00:00:00+08", + "2020-09-08T01:00:00+08", + "2020-09-08T02:00:00+08", + "2020-09-08T03:00:00+08", + "2020-09-08T04:00:00+08", + ], + ), + ( + vec![ + "2024-10-26T23:30:00Z", + "2024-10-27T00:30:00Z", + "2024-10-27T01:30:00Z", + "2024-10-27T02:30:00Z", + ], + Some("Europe/Berlin".into()), + vec![ + "2024-10-27T01:00:00+02", + "2024-10-27T02:00:00+02", + "2024-10-27T02:00:00+01", + "2024-10-27T03:00:00+01", + ], + ), + ( + vec![ + "2018-02-18T00:30:00Z", + "2018-02-18T01:30:00Z", + "2018-02-18T02:30:00Z", + "2018-02-18T03:30:00Z", + "2018-11-04T01:00:00Z", + "2018-11-04T02:00:00Z", + "2018-11-04T03:00:00Z", + "2018-11-04T04:00:00Z", + ], + Some("America/Sao_Paulo".into()), + vec![ + "2018-02-17T22:00:00-02", + "2018-02-17T23:00:00-02", + "2018-02-17T23:00:00-03", + "2018-02-18T00:00:00-03", + "2018-11-03T22:00:00-03", + "2018-11-03T23:00:00-03", + "2018-11-04T01:00:00-02", + "2018-11-04T02:00:00-02", + ], + ), + ]; + + cases.iter().for_each(|(original, tz_opt, expected)| { + let input = original + .iter() + .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) + .collect::() + .with_timezone_opt(tz_opt.clone()); + let right = expected + .iter() + .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) + .collect::() + .with_timezone_opt(tz_opt.clone()); + let result = date_trunc(&[ + ColumnarValue::Scalar(ScalarValue::from("hour")), + ColumnarValue::Array(Arc::new(input)), + ]) + .unwrap(); + if let ColumnarValue::Array(result) = result { + assert_eq!( + result.data_type(), + &DataType::Timestamp(TimeUnit::Nanosecond, tz_opt.clone()) + ); + let left = as_primitive_array::(&result); + assert_eq!(left, &right); + } else { + panic!("unexpected column type"); + } + }); + } + #[test] fn test_date_bin_single() { use chrono::Duration; diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index 8f565453a12f..980545e922c4 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -1262,6 +1262,173 @@ SELECT DATE_TRUNC('day', arrow_cast(TIMESTAMP '2023-08-03 14:38:50Z', 'Timestamp 2023-08-03T14:38:50 2023-08-03T14:38:50 +# date_trunc with data with timezones +statement ok +CREATE TABLE timestamp_strings(ts varchar) +AS VALUES +('2024-10-27 00:00:00'), +('2024-10-27 00:30:00'), +('2024-10-27 01:30:00'), +('2024-10-27 02:00:00'), -- Daylight Savings Time happens here in Berlin +('2024-10-27 02:30:00'), +('2024-10-27 03:00:00'), +('2024-10-27 03:30:00') +; + +statement ok +create view timestamp_utc as +select + arrow_cast(ts, 'Timestamp(Nanosecond, Some("UTC"))') as ts +from timestamp_strings; + +statement ok +create view timestamp_berlin as +select + arrow_cast(ts, 'Timestamp(Nanosecond, Some("Europe/Berlin"))') as ts +from timestamp_utc; -- have to convert to utc prior to converting to berlin + +query PT +select ts, arrow_typeof(ts) from timestamp_utc order by ts; +---- +2024-10-27T00:00:00Z Timestamp(Nanosecond, Some("UTC")) +2024-10-27T00:30:00Z Timestamp(Nanosecond, Some("UTC")) +2024-10-27T01:30:00Z Timestamp(Nanosecond, Some("UTC")) +2024-10-27T02:00:00Z Timestamp(Nanosecond, Some("UTC")) +2024-10-27T02:30:00Z Timestamp(Nanosecond, Some("UTC")) +2024-10-27T03:00:00Z Timestamp(Nanosecond, Some("UTC")) +2024-10-27T03:30:00Z Timestamp(Nanosecond, Some("UTC")) + +query PT +select ts, arrow_typeof(ts) from timestamp_berlin order by ts; +---- +2024-10-27T02:00:00+02:00 Timestamp(Nanosecond, Some("Europe/Berlin")) +2024-10-27T02:30:00+02:00 Timestamp(Nanosecond, Some("Europe/Berlin")) +2024-10-27T02:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin")) +2024-10-27T03:00:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin")) +2024-10-27T03:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin")) +2024-10-27T04:00:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin")) +2024-10-27T04:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin")) + +# date trunc in utc with DST +query PPPP +select ts, date_trunc('month', ts), date_trunc('day', ts), date_trunc('hour', ts) +from timestamp_utc order by ts; +---- +2024-10-27T00:00:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z 2024-10-27T00:00:00Z +2024-10-27T00:30:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z 2024-10-27T00:00:00Z +2024-10-27T01:30:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z 2024-10-27T01:00:00Z +2024-10-27T02:00:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z 2024-10-27T02:00:00Z +2024-10-27T02:30:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z 2024-10-27T02:00:00Z +2024-10-27T03:00:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z 2024-10-27T03:00:00Z +2024-10-27T03:30:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z 2024-10-27T03:00:00Z + + +# date trunc in a timezone with DST across DST boundary (note the date-trunc hour value repeats) +# Test for https://github.com/apache/arrow-datafusion/issues/8899 +query PPPP +select ts, date_trunc('month', ts), date_trunc('day', ts), date_trunc('hour', ts) +from timestamp_berlin order by ts; +---- +2024-10-27T02:00:00+02:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00 2024-10-27T02:00:00+02:00 +2024-10-27T02:30:00+02:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00 2024-10-27T02:00:00+02:00 +2024-10-27T02:30:00+01:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00 2024-10-27T02:00:00+01:00 +2024-10-27T03:00:00+01:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00 2024-10-27T03:00:00+01:00 +2024-10-27T03:30:00+01:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00 2024-10-27T03:00:00+01:00 +2024-10-27T04:00:00+01:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00 2024-10-27T04:00:00+01:00 +2024-10-27T04:30:00+01:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00 2024-10-27T04:00:00+01:00 + +statement ok +drop table timestamp_strings; + +statement ok +drop view timestamp_utc; + +statement ok +drop view timestamp_berlin; + +# date_trunc with data with timezones where transition happens at midnight +statement ok +CREATE TABLE timestamp_strings(ts varchar) +AS VALUES +('2018-11-04 01:00:00'), +('2018-11-04 01:30:00'), +('2018-11-04 02:30:00'), +('2018-11-04 03:00:00'), -- Daylight Savings Time started here in Sao Paulo +('2018-11-04 03:30:00'), +('2018-11-04 04:00:00'), +('2018-11-04 04:30:00') +; + +statement ok +create view timestamp_utc as +select + arrow_cast(ts, 'Timestamp(Nanosecond, Some("UTC"))') as ts +from timestamp_strings; + +statement ok +create view timestamp_sao_paulo as +select + arrow_cast(ts, 'Timestamp(Nanosecond, Some("America/Sao_Paulo"))') as ts +from timestamp_utc; -- have to convert to utc prior to converting to Sau Paulo + +query PT +select ts, arrow_typeof(ts) from timestamp_utc order by ts; +---- +2018-11-04T01:00:00Z Timestamp(Nanosecond, Some("UTC")) +2018-11-04T01:30:00Z Timestamp(Nanosecond, Some("UTC")) +2018-11-04T02:30:00Z Timestamp(Nanosecond, Some("UTC")) +2018-11-04T03:00:00Z Timestamp(Nanosecond, Some("UTC")) +2018-11-04T03:30:00Z Timestamp(Nanosecond, Some("UTC")) +2018-11-04T04:00:00Z Timestamp(Nanosecond, Some("UTC")) +2018-11-04T04:30:00Z Timestamp(Nanosecond, Some("UTC")) + +query PT +select ts, arrow_typeof(ts) from timestamp_sao_paulo order by ts; +---- +2018-11-03T22:00:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) +2018-11-03T22:30:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) +2018-11-03T23:30:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) +2018-11-04T01:00:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) +2018-11-04T01:30:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) +2018-11-04T02:00:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) +2018-11-04T02:30:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) + +# date trunc in utc with DST +query PPPP +select ts, date_trunc('month', ts), date_trunc('day', ts), date_trunc('hour', ts) +from timestamp_utc order by ts; +---- +2018-11-04T01:00:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z 2018-11-04T01:00:00Z +2018-11-04T01:30:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z 2018-11-04T01:00:00Z +2018-11-04T02:30:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z 2018-11-04T02:00:00Z +2018-11-04T03:00:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z 2018-11-04T03:00:00Z +2018-11-04T03:30:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z 2018-11-04T03:00:00Z +2018-11-04T04:00:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z 2018-11-04T04:00:00Z +2018-11-04T04:30:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z 2018-11-04T04:00:00Z + + +# date trunc in a timezone with DST across DST boundary (note there is no midnight on 2018-11-04) +# Test for https://github.com/apache/arrow-datafusion/issues/8899 +query PPPP +select ts, date_trunc('month', ts), date_trunc('day', ts), date_trunc('hour', ts) +from timestamp_sao_paulo order by ts; +---- +2018-11-03T22:00:00-03:00 2018-11-01T00:00:00-03:00 2018-11-03T00:00:00-03:00 2018-11-03T22:00:00-03:00 +2018-11-03T22:30:00-03:00 2018-11-01T00:00:00-03:00 2018-11-03T00:00:00-03:00 2018-11-03T22:00:00-03:00 +2018-11-03T23:30:00-03:00 2018-11-01T00:00:00-03:00 2018-11-03T00:00:00-03:00 2018-11-03T23:00:00-03:00 +2018-11-04T01:00:00-02:00 2018-11-01T00:00:00-03:00 2018-11-04T01:00:00-02:00 2018-11-04T01:00:00-02:00 +2018-11-04T01:30:00-02:00 2018-11-01T00:00:00-03:00 2018-11-04T01:00:00-02:00 2018-11-04T01:00:00-02:00 +2018-11-04T02:00:00-02:00 2018-11-01T00:00:00-03:00 2018-11-04T01:00:00-02:00 2018-11-04T02:00:00-02:00 +2018-11-04T02:30:00-02:00 2018-11-01T00:00:00-03:00 2018-11-04T01:00:00-02:00 2018-11-04T02:00:00-02:00 + +statement ok +drop table timestamp_strings; + +statement ok +drop view timestamp_utc; + +statement ok +drop view timestamp_sao_paulo; # Demonstrate that strings are automatically coerced to timestamps (don't use TIMESTAMP)