9 Dates
9.1 Basics
- Test Function to identify various null values in columns having dates in different formats and to get minimum and maximum dates out of those
Python
def datecols(dfs=None, inc=None, exc=None, date_strings=None, na_dates=None, justified=20, max_print=3,
skip_warn=False):
import traceback #To get Line Numbers of Errors from Imported or Read Function Code
try:
import numpy as np, pandas as pd
from IPython.display import display
#
if dfs is None:
#Default Test Case
tst = pd.DataFrame() #NULL DataFrame
#YYYY-MM-DD Dates as Strings: OK, NOT 29, OK, NOT 30, OK Month, OK 99 Split, OK 2000
tst['org']=['2023-02-28', '2023-02-29', '2024-02-29', '2024-02-30', '2024-03-01',
'1999-11-30', '2000-02-01',
'1900-01-01', '0', 'NULL', None]
tst['aok']=['2024-01-15'] * tst.shape[0]
#Case-Insensitive Replacement of various kinds of NULLS by 'np.nan'
#Source is 'na_values' argument of pd.read_excel()
na_values =[' ', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
'1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'None', 'n/a',
'nan', 'np.nan', '0', '0.0', '0.00', '']
tmp = dict.fromkeys(['(?i)^{}$'.format(x) for x in na_values], np.nan)
tst['ymd'] = tst['org'].astype(str).replace(
to_replace=tmp.keys(), value=tmp.values(), regex=True)
#
tst['tms'] = pd.to_datetime(tst['ymd'], format='ISO8601', errors='coerce') #TimeStamp
tst['dat'] = tst['tms'].dt.date #Date
#
#Get N characters
dd = tst['ymd'].str[-2:] #Last 2
mm = tst['ymd'].str[5:7] #Middle
yy = tst['ymd'].str[:4] #First 4
tst['mdy'] = mm + '/' + dd + '/' + yy
tst['flt'] = (yy.str[-2:] + mm + dd).astype(float) #Float with NULLS
tst['num'] = tst['flt'].fillna(0).astype(int) #Integers of less than 6 length
#
letters = 'abcdefghijklmnopqrstuvwxyz'
tst['abc'] = list(letters[ : tst.shape[0]]) #Character Column
tst['drp'] = list(letters[-tst.shape[0] : ]) #Exclude Column
tst['nul'] = [None]*tst.shape[0] #ALL NULLS
tst['bol'] = [True]*tst.shape[0] #Boolean
#
#Dictionary of DF of Dictionary of Column Formats
dft = {'tst': {'ymd': '%Y-%m-%d', 'mdy':'%m/%d/%Y'}}
if False:
datecols(dfs = dict({'tst':tst}), inc=['abc'], exc=['drp'])
cpy = tst.copy().drop(columns=['abc'])
datecols(dfs = dict({'tst':tst, 'cpy':cpy}), inc=['flt', 'abc'], exc=['drp'])
if False: datecols(dfs = tst, exc=['drp']) #ERROR OK TST ONLY
datecols(dfs = [tst], exc=['drp']) #ERROR CAUGHT
datecols(dfs = dict({'tst':tst}), exc=['drp'], max_print=9)
datecols(dfs = dict({'tst':tst}), exc=['drp']) #OK without supplying NULLS
datecols(dfs = dict({'tst':tst}), exc=['drp'], na_dates='') #OK ERROR CAUGHT
tst['tms_x'] = tst['tms'].copy()
datecols(dfs = dict({'tst':tst}), exc=['tms_x'], skip_warn=True) #OK
datecols(dfs = dict({'tst':tst}), exc=['tms_x']) #OK
datecols(dfs = dict({'tst':tst}), na_dates=['1900-01-01']) #OK
datecols(dfs = dict({'tst':tst}), date_strings=dft, na_dates=['1900-01-01']) #OK
#
datecols(dfs = dict({'tst':tst}), exc=['aok','abc','drp','nul','bol'], date_strings=dft,
na_dates=['1900-01-01'], skip_warn=True)
return None #Required to Stop Debug
#
if type(dfs) is not pd.core.frame.DataFrame and type(dfs) is not dict:
raise TypeError(f"type('dfs') (DF or dict(DF)) != {type(dfs)}")
elif type(dfs) is pd.core.frame.DataFrame:
nm = [x for x in globals() if globals()[x] is dfs][0]
dfs = dict({nm: dfs})
date_strings = dict({nm: date_strings})
#
if na_dates is not None and type(na_dates) is not list and type(na_dates) is not str:
raise TypeError(f"type('na_dates') (date string or list) != {type(na_dates)}")
elif type(na_dates) is str:
na_dates = [na_dates]
#
#Convert User supplied Null Dates from List of String Dates to List of DATETIME
if na_dates is not None:
na_dates = pd.to_datetime(na_dates)
if na_dates.isna().any(): raise ValueError(f"Invalid 'na_dates': {na_dates}")
#
warn_miss= dict()
warn_101 = dict()
#
for key, df in dfs.items():
#Subset Columns: (All ∩ Included) - Excluded | Deep Copy | Reset Index
cols = df.columns.to_list()
if inc is not None: cols = [i for i in cols if i in inc]
if exc is not None: cols = [i for i in cols if i not in exc]
#Get a list of DateTime Columns that will not be considered
l_dtm = df.select_dtypes(include=['datetime64']).columns.to_list()
tmp = list(set(l_dtm).difference(cols))
if tmp != list(): warn_miss[key] = tmp
#
df = df[cols].copy(deep=True).reset_index(drop=True)
#Show DF
if False:
with pd.option_context( 'display.max_rows', None, 'display.max_columns', None,
'display.width', None): display(df.head(20))
#
l_rows = [] #Empty List to be appended with a dictionary in each iteration
njust = len(max(cols + [key], key=len)) #Get Maximum Length from a List of Strings
for idx, c in enumerate(cols):
org = df[c]
mod = org.copy()
#Convert Numerics to Strings
if mod.dtype.kind in 'f': mod = pd.Series(np.where(mod.isnull(), None,
mod.apply('{:.0f}'.format))) #floats
if mod.dtype.kind in 'i': mod = mod.astype(str) #integers
#
if mod.dtype.kind in 'O':
#Cast as String to Convert 'datetime.date' to Strings
tmp = mod[mod.notna()].astype(str)
if len(tmp) != 0:
lmax = len(max(tmp, key=len))
lmin = len(min(tmp, key=len))
if lmax == 6 and lmax != lmin: mod = mod.str.zfill(lmax) #zero-padding
#
if (mod=='000101').any():
if key not in warn_101.keys(): warn_101[key] = list()
warn_101[key] = warn_101[key] + [c]
#
#YY between 00 and 30 to be taken as 21st Century [2000, 2099]
if lmax==6: mod = pd.Series(np.where(
mod.str[:2].astype(float) <= 30, '20'+mod, '19'+mod))
#
#If available, Get User-Supplied String Format
u_dt_str = None
is_dt_str= (date_strings is not None and key in date_strings.keys()
and date_strings[key] is not None and c in date_strings[key].keys())
if is_dt_str: u_dt_str = date_strings[key][c]
#If User Supplies wrong-format for DF-Column combination,
#...it would result in lots of Nulls
#
mod = pd.to_datetime(mod, errors='coerce', format=u_dt_str) #Get DateTime
#
#If Value is NOT in User NULLS, Then Keep the Value, Else NULL
if na_dates is not None: mod = mod.where(~mod.isin(na_dates), pd.NaT)
#
tmp = pd.DataFrame({'original': org, 'modified': mod})
#display(tmp)
#
sum_na = tmp['modified'].isna().sum() #Select Column by Name and Count Nulls
sum_na_o= tmp.iloc[ :, 0].isna().sum() #Select Column by Position
#
#'Minimum'.center(justified)
one_row = {'Columns': c, 'Minimum': '', 'Maximum':'', 'Nulls': '',
'pct':'', 'orig':'', 'uniq':'', 'samples':''}
#
no_nulls= tmp.query("modified.notna()")
uniq_nul= tmp.query("original.notna() and modified.isna()"
)['original'].unique().tolist()
#Find Min/Max Value in Modified Column | Get corresponding Original |
#Pull First Row Value using Position | To String | Left Justified by N
if no_nulls.shape[0] != 0:
one_row['Minimum'] = str(no_nulls.query("modified == modified.min()"
)['original'].iloc[0]).ljust(justified)
one_row['Maximum'] = str(no_nulls.query("modified == modified.max()"
)['original'].iloc[0]).ljust(justified)
#
if sum_na != 0:
#Percentage Cleanup
pct = 100 * sum_na/df.shape[0]
if pct == 100:
pct = '100%'
elif pct > 99:
pct = '>99%'
elif pct < 1:
pct = ' <1%'
else:
pct = (str(round(pct)) + '%').rjust(4)
#
one_row['Nulls'] = sum_na
one_row['pct'] = pct
if sum_na != sum_na_o: one_row['orig']= sum_na_o
one_row['uniq'] = len(uniq_nul)
one_row['samples']= uniq_nul[ :max_print]
#
l_rows.append(one_row)
#
out = pd.DataFrame(l_rows) #List of Dictionary to DataFrame
out.rename(columns = {'Columns': key}, inplace=True)
#
#Print ALL Columns and ALL Rows of the DataFrame
with pd.option_context( 'display.max_rows', None, 'display.max_columns', None,
'display.width', None):
display(out)
#
print()
#
if not skip_warn:
if warn_miss != dict(): print(f"Warning: Missed DateTime Columns: {warn_miss}")
if warn_101 != dict(): print(f"Warning: '101' == 2000-01-01 != 1900-01-01: {warn_101}")
print()
#
return None
except:
print(traceback.format_exc())
raise #To Stop Code Execution after Errors
#
#
Python
datecols()
## tst Minimum Maximum Nulls pct orig uniq \
## 0 org 1999-11-30 2024-03-01 6 55% 1 5
## 1 ymd 1999-11-30 2024-03-01 6 55% 3 3
## 2 tms 1999-11-30 00:00:00 2024-03-01 00:00:00 6 55% 5 1
## 3 dat 1999-11-30 2024-03-01 6 55% 5 1
## 4 mdy 11/30/1999 03/01/2024 6 55% 3 3
## 5 flt 991130.0 240301.0 5 45% 3 2
## 6 num 991130 240301 5 45% 0 3
##
## samples
## 0 [2023-02-29, 2024-02-30, 1900-01-01]
## 1 [2023-02-29, 2024-02-30, 1900-01-01]
## 2 [1900-01-01 00:00:00]
## 3 [1900-01-01]
## 4 [02/29/2023, 02/30/2024, 01/01/1900]
## 5 [230229.0, 240230.0]
## 6 [230229, 240230, 0]