Vi. date and time pretreatment
Split date and time into multiple features
# Loading Library
import pandas as pd
# Create data frame
df = pd.DataFrame()
# Create five dates
df['date'] = pd.date_range('1/1/2001', periods=150, freq='W')
# Create feature for MM DD yyyy, HHM / S
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute
# Show three rows
df.head(3)
|
date |
year |
month |
day |
hour |
minute |
0 |
2001-01-07 |
2001 |
1 |
7 |
0 |
0 |
1 |
2001-01-14 |
2001 |
1 |
14 |
0 |
0 |
2 |
2001-01-21 |
2001 |
1 |
21 |
0 |
0 |
Calculate the difference between date and time
# Loading Library
import pandas as pd
# Create data frame
df = pd.DataFrame()
# Create two datetime features
df['Arrived'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-04-2017')]
df['Left'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-06-2017')]
# Calculate the spacing between features
df['Left'] - df['Arrived']
'''
0 0 days
1 2 days
dtype: timedelta64[ns]
'''
# Calculate the spacing between features
pd.Series(delta.days for delta in (df['Left'] - df['Arrived']))
'''
0 0
1 2
dtype: int64
'''
Convert string to date
# Loading Library
import numpy as np
import pandas as pd
# Create string
date_strings = np.array(['03-04-2005 11:35 PM',
'23-05-2010 12:01 AM',
'04-09-2009 09:09 PM'])
If errors="coerce" then no problem will produce an error (the default behavior), but the value that caused the error is set to NaT (the missing value).
Code |
describe |
Example |
%Y |
Whole year |
2001 |
%m |
Zero filled month |
04 |
%d |
Zero filled date |
09 |
%I |
Zero filling hours (12 hours) |
02 |
%p |
AM or PM |
AM |
%M |
Zero filled minutes |
05 |
%S |
Zero filled seconds |
09 |
# Convert to datetime
[pd.to_datetime(date, format="%d-%m-%Y %I:%M %p", errors="coerce") for date in date_strings]
'''
[Timestamp('2005-04-03 23:35:00'),
Timestamp('2010-05-23 00:01:00'),
Timestamp('2009-09-04 21:09:00')]
'''
Convert time zone of pandas column
# Loading Library
import pandas as pd
from pytz import all_timezones
# Show ten time zones
all_timezones[0:10]
'''
['Africa/Abidjan',
'Africa/Accra',
'Africa/Addis_Ababa',
'Africa/Algiers',
'Africa/Asmara',
'Africa/Asmera',
'Africa/Bamako',
'Africa/Bangui',
'Africa/Banjul',
'Africa/Bissau']
'''
# Create ten dates
dates = pd.Series(pd.date_range('2/2/2002', periods=10, freq='M'))
# Set time zone
dates_with_abidjan_time_zone = dates.dt.tz_localize('Africa/Abidjan')
# View pandas sequence
dates_with_abidjan_time_zone
'''
0 2002-02-28 00:00:00+00:00
1 2002-03-31 00:00:00+00:00
2 2002-04-30 00:00:00+00:00
3 2002-05-31 00:00:00+00:00
4 2002-06-30 00:00:00+00:00
5 2002-07-31 00:00:00+00:00
6 2002-08-31 00:00:00+00:00
7 2002-09-30 00:00:00+00:00
8 2002-10-31 00:00:00+00:00
9 2002-11-30 00:00:00+00:00
dtype: datetime64[ns, Africa/Abidjan]
'''
# Conversion time zone
dates_with_london_time_zone = dates_with_abidjan_time_zone.dt.tz_convert('Europe/London')
# View pandas sequence
dates_with_london_time_zone
'''
0 2002-02-28 00:00:00+00:00
1 2002-03-31 00:00:00+00:00
2 2002-04-30 01:00:00+01:00
3 2002-05-31 01:00:00+01:00
4 2002-06-30 01:00:00+01:00
5 2002-07-31 01:00:00+01:00
6 2002-08-31 01:00:00+01:00
7 2002-09-30 01:00:00+01:00
8 2002-10-31 00:00:00+00:00
9 2002-11-30 00:00:00+00:00
dtype: datetime64[ns, Europe/London]
'''
Coding week
# Loading Library
import pandas as pd
# Create dataset
dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq='M'))
# View data
dates
'''
0 2002-02-28
1 2002-03-31
2 2002-04-30
dtype: datetime64[ns]
'''
# Check week
dates.dt.weekday_name
'''
0 Thursday
1 Sunday
2 Tuesday
dtype: object
'''
Handling missing values in time series
# Loading Library
import pandas as pd
import numpy as np
# Date of creation
time_index = pd.date_range('01/01/2010', periods=5, freq='M')
# Create data frame, set index
df = pd.DataFrame(index=time_index)
# Create features with some missing values
df['Sales'] = [1.0,2.0,np.nan,np.nan,5.0]
# Interpolation of missing values
df.interpolate()
|
Sales |
2010-01-31 |
1.0 |
2010-02-28 |
2.0 |
2010-03-31 |
3.0 |
2010-04-30 |
4.0 |
2010-05-31 |
5.0 |
# Forward filling
df.ffill()
|
Sales |
2010-01-31 |
1.0 |
2010-02-28 |
2.0 |
2010-03-31 |
2.0 |
2010-04-30 |
2.0 |
2010-05-31 |
5.0 |
# Backward filling
df.bfill()
|
Sales |
2010-01-31 |
1.0 |
2010-02-28 |
2.0 |
2010-03-31 |
5.0 |
2010-04-30 |
5.0 |
2010-05-31 |
5.0 |
# Interpolation of missing values
df.interpolate(limit=1, limit_direction='forward')
|
Sales |
2010-01-31 |
1.0 |
2010-02-28 |
2.0 |
2010-03-31 |
3.0 |
2010-04-30 |
NaN |
2010-05-31 |
5.0 |
Processing time zones
# Loading Library
import pandas as pd
from pytz import all_timezones
# Show ten time zones
all_timezones[0:10]
'''
['Africa/Abidjan',
'Africa/Accra',
'Africa/Addis_Ababa',
'Africa/Algiers',
'Africa/Asmara',
'Africa/Asmera',
'Africa/Bamako',
'Africa/Bangui',
'Africa/Banjul',
'Africa/Bissau']
'''
# Create datetime
pd.Timestamp('2017-05-01 06:00:00', tz='Europe/London')
# Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')
# Create datetime
date = pd.Timestamp('2017-05-01 06:00:00')
# Set time zone
date_in_london = date.tz_localize('Europe/London')
# Modify time zone
date_in_london.tz_convert('Africa/Abidjan')
# Timestamp('2017-05-01 05:00:00+0000', tz='Africa/Abidjan')
Translation time feature
# Loading Library
import pandas as pd
# Create data frame
df = pd.DataFrame()
# Create data
df['dates'] = pd.date_range('1/1/2001', periods=5, freq='D')
df['stock_price'] = [1.1,2.2,3.3,4.4,5.5]
# Translate values one row
df['previous_days_stock_price'] = df['stock_price'].shift(1)
# Display data frame
df
|
dates |
stock_price |
previous_days_stock_price |
0 |
2001-01-01 |
1.1 |
NaN |
1 |
2001-01-02 |
2.2 |
1.1 |
2 |
2001-01-03 |
3.3 |
2.2 |
3 |
2001-01-04 |
4.4 |
3.3 |
4 |
2001-01-05 |
5.5 |
4.4 |
Slide time window
# Loading Library
import pandas as pd
# Create datetime
time_index = pd.date_range('01/01/2010', periods=5, freq='M')
# Create data frame, set index
df = pd.DataFrame(index=time_index)
# Create features
df['Stock_Price'] = [1,2,3,4,5]
# Calculate the sliding mean
df.rolling(window=2).mean()
|
Stock_Price |
2010-01-31 |
NaN |
2010-02-28 |
1.5 |
2010-03-31 |
2.5 |
2010-04-30 |
3.5 |
2010-05-31 |
4.5 |
# Identify the maximum value in the sliding time window
df.rolling(window=2).max()
|
Stock_Price |
2010-01-31 |
NaN |
2010-02-28 |
2.0 |
2010-03-31 |
3.0 |
2010-04-30 |
4.0 |
2010-05-31 |
5.0 |
Select date time range
# Loading Library
import pandas as pd
# Create data frame
df = pd.DataFrame()
# Create datetime
df['date'] = pd.date_range('1/1/2001', periods=100000, freq='H')
Use this method if the data frame is not indexed by time.
# Select an observation between two dates and times
df[(df['date'] > '2002-1-1 01:00:00') & (df['date'] <= '2002-1-1 04:00:00')]
|
date |
8762 |
2002-01-01 02:00:00 |
8763 |
2002-01-01 03:00:00 |
8764 |
2002-01-01 04:00:00 |
Use this method if data frames are indexed by time.
# catalog index
df = df.set_index(df['date'])
# Select an observation between two dates and times
df.loc['2002-1-1 01:00:00':'2002-1-1 04:00:00']
|
date |
date |
|
2002-01-01 01:00:00 |
2002-01-01 01:00:00 |
2002-01-01 02:00:00 |
2002-01-01 02:00:00 |
2002-01-01 03:00:00 |
2002-01-01 03:00:00 |
2002-01-01 04:00:00 |
2002-01-01 04:00:00 |