Notes on data science and artificial intelligence VI. date and time preprocessing

Vi. date and time pretreatment

Split date and time into multiple features

# Loading Library
import pandas as pd

# Create data frame
df = pd.DataFrame()

# Create five dates
df['date'] = pd.date_range('1/1/2001', periods=150, freq='W')

# Create feature for MM DD yyyy, HHM / S
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute

# Show three rows
df.head(3)

	date	year	month	day
0	2001-01-07	2001	1	7
1	2001-01-14	2001	1	14
2	2001-01-21	2001	1	21

Calculate the difference between date and time

# Loading Library
import pandas as pd

# Create data frame
df = pd.DataFrame()

# Create two datetime features
df['Arrived'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-04-2017')]
df['Left'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-06-2017')]

# Calculate the spacing between features
df['Left'] - df['Arrived']

'''
0   0 days
1   2 days
dtype: timedelta64[ns] 
'''

# Calculate the spacing between features
pd.Series(delta.days for delta in (df['Left'] - df['Arrived']))

'''
0    0
1    2
dtype: int64 
'''

Convert string to date

# Loading Library
import numpy as np
import pandas as pd

# Create string
date_strings = np.array(['03-04-2005 11:35 PM',
                         '23-05-2010 12:01 AM',
                         '04-09-2009 09:09 PM'])

If errors="coerce" then no problem will produce an error (the default behavior), but the value that caused the error is set to NaT (the missing value).

Code	describe	Example
%Y	Whole year	2001
%m	Zero filled month	04
%d	Zero filled date	09
%I	Zero filling hours (12 hours)	02
%p	AM or PM	AM
%M	Zero filled minutes	05
%S	Zero filled seconds	09

# Convert to datetime
[pd.to_datetime(date, format="%d-%m-%Y %I:%M %p", errors="coerce") for date in date_strings]

'''
[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')] 
'''

Convert time zone of pandas column

# Loading Library
import pandas as pd
from pytz import all_timezones

# Show ten time zones
all_timezones[0:10]

'''
['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara',
 'Africa/Asmera',
 'Africa/Bamako',
 'Africa/Bangui',
 'Africa/Banjul',
 'Africa/Bissau'] 
'''

# Create ten dates
dates = pd.Series(pd.date_range('2/2/2002', periods=10, freq='M'))

# Set time zone
dates_with_abidjan_time_zone = dates.dt.tz_localize('Africa/Abidjan')

# View pandas sequence
dates_with_abidjan_time_zone

'''
0   2002-02-28 00:00:00+00:00
1   2002-03-31 00:00:00+00:00
2   2002-04-30 00:00:00+00:00
3   2002-05-31 00:00:00+00:00
4   2002-06-30 00:00:00+00:00
5   2002-07-31 00:00:00+00:00
6   2002-08-31 00:00:00+00:00
7   2002-09-30 00:00:00+00:00
8   2002-10-31 00:00:00+00:00
9   2002-11-30 00:00:00+00:00
dtype: datetime64[ns, Africa/Abidjan] 
'''

# Conversion time zone
dates_with_london_time_zone = dates_with_abidjan_time_zone.dt.tz_convert('Europe/London')

# View pandas sequence
dates_with_london_time_zone

'''
0   2002-02-28 00:00:00+00:00
1   2002-03-31 00:00:00+00:00
2   2002-04-30 01:00:00+01:00
3   2002-05-31 01:00:00+01:00
4   2002-06-30 01:00:00+01:00
5   2002-07-31 01:00:00+01:00
6   2002-08-31 01:00:00+01:00
7   2002-09-30 01:00:00+01:00
8   2002-10-31 00:00:00+00:00
9   2002-11-30 00:00:00+00:00
dtype: datetime64[ns, Europe/London] 
'''

Coding week

# Loading Library
import pandas as pd

# Create dataset
dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq='M'))

# View data
dates

'''
0   2002-02-28
1   2002-03-31
2   2002-04-30
dtype: datetime64[ns] 
'''

# Check week
dates.dt.weekday_name

'''
0    Thursday
1      Sunday
2     Tuesday
dtype: object 
'''

Handling missing values in time series

# Loading Library
import pandas as pd
import numpy as np

# Date of creation
time_index = pd.date_range('01/01/2010', periods=5, freq='M')

# Create data frame, set index
df = pd.DataFrame(index=time_index)

# Create features with some missing values
df['Sales'] = [1.0,2.0,np.nan,np.nan,5.0]

# Interpolation of missing values
df.interpolate()

	Sales
2010-01-31	1.0
2010-02-28	2.0
2010-03-31	3.0
2010-04-30	4.0
2010-05-31	5.0

# Forward filling
df.ffill()

	Sales
2010-01-31	1.0
2010-02-28	2.0
2010-03-31	2.0
2010-04-30	2.0
2010-05-31	5.0

# Backward filling
df.bfill()

	Sales
2010-01-31	1.0
2010-02-28	2.0
2010-03-31	5.0
2010-04-30	5.0
2010-05-31	5.0

# Interpolation of missing values
df.interpolate(limit=1, limit_direction='forward')

	Sales
2010-01-31	1.0
2010-02-28	2.0
2010-03-31	3.0
2010-04-30	NaN
2010-05-31	5.0

Processing time zones

# Loading Library
import pandas as pd
from pytz import all_timezones

# Show ten time zones
all_timezones[0:10]

'''
['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara',
 'Africa/Asmera',
 'Africa/Bamako',
 'Africa/Bangui',
 'Africa/Banjul',
 'Africa/Bissau'] 
'''

# Create datetime
pd.Timestamp('2017-05-01 06:00:00', tz='Europe/London')

# Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London') 

# Create datetime
date = pd.Timestamp('2017-05-01 06:00:00')

# Set time zone
date_in_london = date.tz_localize('Europe/London')

# Modify time zone
date_in_london.tz_convert('Africa/Abidjan')

# Timestamp('2017-05-01 05:00:00+0000', tz='Africa/Abidjan')

Translation time feature

# Loading Library
import pandas as pd

# Create data frame
df = pd.DataFrame()

# Create data
df['dates'] = pd.date_range('1/1/2001', periods=5, freq='D')
df['stock_price'] = [1.1,2.2,3.3,4.4,5.5]

# Translate values one row
df['previous_days_stock_price'] = df['stock_price'].shift(1)

# Display data frame
df

	dates	stock_price	previous_days_stock_price
0	2001-01-01	1.1	NaN
1	2001-01-02	2.2	1.1
2	2001-01-03	3.3	2.2
3	2001-01-04	4.4	3.3
4	2001-01-05	5.5	4.4

Slide time window

# Loading Library
import pandas as pd

# Create datetime
time_index = pd.date_range('01/01/2010', periods=5, freq='M')

# Create data frame, set index
df = pd.DataFrame(index=time_index)

# Create features
df['Stock_Price'] = [1,2,3,4,5]

# Calculate the sliding mean
df.rolling(window=2).mean()

	Stock_Price
2010-01-31	NaN
2010-02-28	1.5
2010-03-31	2.5
2010-04-30	3.5
2010-05-31	4.5

# Identify the maximum value in the sliding time window
df.rolling(window=2).max()

	Stock_Price
2010-01-31	NaN
2010-02-28	2.0
2010-03-31	3.0
2010-04-30	4.0
2010-05-31	5.0

Select date time range

# Loading Library
import pandas as pd

# Create data frame
df = pd.DataFrame()

# Create datetime
df['date'] = pd.date_range('1/1/2001', periods=100000, freq='H')

Use this method if the data frame is not indexed by time.

# Select an observation between two dates and times
df[(df['date'] > '2002-1-1 01:00:00') & (df['date'] <= '2002-1-1 04:00:00')]

	date
8762	2002-01-01 02:00:00
8763	2002-01-01 03:00:00
8764	2002-01-01 04:00:00

Use this method if data frames are indexed by time.

# catalog index
df = df.set_index(df['date'])

# Select an observation between two dates and times
df.loc['2002-1-1 01:00:00':'2002-1-1 04:00:00']

	date
date
2002-01-01 01:00:00	2002-01-01 01:00:00
2002-01-01 02:00:00	2002-01-01 02:00:00
2002-01-01 03:00:00	2002-01-01 03:00:00
2002-01-01 04:00:00	2002-01-01 04:00:00

Added by wshell on Wed, 11 Dec 2019 02:55:44 +0200

Programming VIP