Cleaning Data with pandas
Reading Data
df = pd.read_json('data/nobel_winners_dirty.json')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1052 entries, 0 to 1051
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 born_in 1052 non-null object
...
dtypes: int64(1), object(11)
memory usage: 98.8+ KB
Cleaning String
df.name = df.name.str.replace(r'\*', '', regex=True)
df.name = df.name.str.strip()
Selecting Multiple Rows
# select the first 10 rows
df[0:10]
# create filter
mask = df.year > 2000
mask
0 False
1 False
2 True
...
winners_since_2000 = df[mask]
Finding Mixed Types
# result shows values in 'born_in' column only have string value type
set(df.born_in.map(type))
{str}
Replacing Strings
# replace empty string with a NaN
bi_col.replace('', np.nan, inplace=True)
Converting empty strings to NaN prevents them from being counted as valid values by count() and describe().
Removing Rows
df = df[df.born_in.isnull()]
df = df.drop('born_in', axis=1)
Finding Duplicates
# duplicated method creates a boolean mask for duplicates
dupes_by_name = df[df.duplicated('name')]
all_dupes = df[df.duplicated('name') | df.duplicated('name', keep='last')]
all_dupes = df[df.duplicated('name', keep=False)]
all_dupes = df[df.name.isin(dupes_by_name.name)]
for name, rows in df.groupby('name'):
print(f'name:{name}, # rows: {len(row)}')
pd.concat([g for _, g in df.groupby('name') if len(g) > 1])['name']
each of the lines above does the same thing: selecting the rows whose name is duplicated
Sorting Data
df = pd.DataFrame({'name': ['zak', 'alice', 'bob', 'mike', 'bob', 'bob'],
'score': [4, 3, 5, 2, 3, 7]})
df.sort_values(['name', 'score'], ascending=[1,0])
name score
1 alice 3
5 bob 7
2 bob 5
4 bob 3
3 mike 2
0 zak 4
View vs Copies
# System will give warning because the code is trying to alter a value of a copy of DataFrame
df['country'][709] = 'France'
# Instead, it should be done as this:
df.loc[709, 'country'] = 'France'
Removing Duplicates
df[df.duplicated('name', keep=False)].sort_values(by='name')[['name', 'country', 'year', 'category']]
sorts all of the rows that have a duplicated name, ordered by name
Dealing with Times and Dates
# errors='raise' is the default: unparseable dates raise an exception
pd.to_datetime(df.date_of_birth, errors='raise')
for i, row in df.iterrows():
try:
pd.to_datetime(row.date_of_death, errors='raise')
except:
print(f'{row.date_of_death.ljust(30)}({row["name"]}, {i})')
with_death_dates = df[df.date_of_death.notnull()]
bad_dates = pd.isnull(pd.to_datetime(with_death_dates.date_of_death, errors='coerce'))
with_death_dates[bad_dates][['category', 'date_of_death']]
df.date_of_death = pd.to_datetime(df.date_of_death, errors='coerce')
# initializing pandas.DatetimeIndex class with df.date_of_birth
df['award_age'] = df.year - pd.DatetimeIndex(df.date_of_birth).year
Full Data-Cleaning Method
def clean_data(df):
    """Clean the raw Nobel-winners DataFrame.

    Returns a tuple ``(df, df_born_in)``: the cleaned winners DataFrame
    (born_in column dropped, duplicates and bad rows removed, dates
    parsed, award_age added) and the split-off rows that carried a
    born_in value, to be processed later.
    """
    # Empty strings would otherwise be counted as valid values by count()/describe().
    df = df.replace('', np.nan)
    # Split off the rows that have a born_in field; keep the rest.
    df_born_in = df[df.born_in.notnull()]
    df = df[df.born_in.isnull()]
    # born_in is now all-NaN in the remaining rows -- drop the column.
    df = df.drop('born_in', axis=1)
    # 1809 pre-dates the first Nobel Prize -- a data-entry error.
    df.drop(df[df.year == 1809].index, inplace=True)
    # 'Marie Curie' rows duplicate the 'Marie Sklodowska-Curie' entries.
    df = df[~(df.name == 'Marie Curie')]
    # Fix the country recorded for her second (1911) prize.
    df.loc[(df.name == 'Marie Sk\u0142odowska-Curie') & (df.year == 1911), 'country'] = 'France'
    df = df[~((df.name == 'Sidney Altman') & (df.year == 1990))]
    # Shuffle the rows so drop_duplicates keeps an arbitrary member of
    # each duplicate pair, then restore the original index order.
    df = df.reindex(np.random.permutation(df.index))
    df = df.drop_duplicates(['name', 'year'])
    df = df.sort_index()
    # Patch individually known-bad fields.
    df.loc[df.name == 'Alexis Carrel', 'category'] = 'Physiology or Medicine'
    df.loc[df.name == 'Ragnar Granit', 'gender'] = 'male'
    df = df[df.gender.notnull()]  # remove institutional prizes
    # BUG FIX: was '11 Septeber 1960' (misspelled month), which made the
    # pd.to_datetime call below raise on this row.
    df.loc[df.name == 'Hiroshi Amano', 'date_of_birth'] = '11 September 1960'
    df.date_of_birth = pd.to_datetime(df.date_of_birth)
    df.date_of_death = pd.to_datetime(df.date_of_death, errors='coerce')
    df['award_age'] = df.year - pd.DatetimeIndex(df.date_of_birth).year
    return df, df_born_in  # df_born_in will be processed later
Adding the born_in
column
df_clean, df_born_in = clean_data(df)
# clean up born_in
df_born_in.name = df_born_in.name.str.replace(r'\*', '', regex=True)
df_born_in.name = df_born_in.name.str.strip()
# only keep the first entry of duplicates (keep='first')
df_born_in.drop_duplicates(subset=['name'], inplace=True)
df_born_in.set_index('name', inplace=True)
def get_born_in(name):
    """Return the born_in value for *name*, or NaN when *name* is not
    present in the name-indexed df_born_in DataFrame."""
    try:
        # Single-label .loc lookup; raises KeyError when name is absent.
        # (Was chained indexing .loc[name]['born_in'] with a bare except,
        # which also swallowed unrelated errors such as NameError.)
        born_in = df_born_in.loc[name, 'born_in']
        # print(f'name: {name}, born in: {born_in}')
    except KeyError:
        # No born_in entry for this winner -- leave the column as NaN.
        born_in = np.nan
    return born_in
df_wbi = df_clean.copy()
df_wbi['born_in'] = df_wbi['name'].apply(get_born_in)
Merging DataFrames
df_winners_bios = pd.read_json('data/scrapy_nwinners_minibio.json')
# link is the one that matches (name was string-manipulated)
df_clean_bios = pd.merge(df_wbi, df_winners_bios, how='outer', on='link')
df_clean_bios = df_clean_bios[~df_clean_bios.name.isnull()].drop_duplicates(subset=['link', 'year'])
df_clean_bios.drop('image_urls', axis=1, inplace=True)
df_clean_bios.to_json('data/nobel_winners_cleaned.json', orient='records', date_format='iso')
To_SQL
import sqlalchemy
engine = sqlalchemy.create_engine('sqlite:///data/nobel_winners_clean.db')
df_clean_bios.to_sql('winners', engine, if_exists='replace')
df_read_sql = pd.read_sql('winners', engine)