Cleaning Data with pandas

Reading Data
import numpy as np
import pandas as pd

df = pd.read_json('data/nobel_winners_dirty.json')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1052 entries, 0 to 1051
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   born_in         1052 non-null   object
...
dtypes: int64(1), object(11)
memory usage: 98.8+ KB
Cleaning Strings
df.name = df.name.str.replace(r'\*', '', regex=True)
df.name = df.name.str.strip()
Selecting Multiple Rows
# select the first 10 rows
df[0:10]

# create filter
mask = df.year > 2000
mask
0   False
1   False
2   True
...

winners_since_2000 = df[mask]
Finding Mixed Types
# the set of element types shows the 'born_in' column contains only strings
set(df.born_in.map(type))
{str}
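
For contrast, a column mixing strings with missing values would report more than one type. A toy sketch (made-up values, not from the dataset):

# NaN is a float, so a column containing names and NaN shows two element types
s = pd.Series(['Albert Einstein', np.nan, 'Marie Curie'])
set(s.map(type))
{str, float}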
Replacing Strings
# replace empty strings in the born_in column with NaN
bi_col = df.born_in
bi_col.replace('', np.nan, inplace=True)

Converting empty strings to NaN prevents them from being counted as valid values by count() and describe().
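
A quick sketch of the difference (made-up values):

s = pd.Series(['1907-05-12', '', '1923-01-30'])
s.count()                      # 3: the empty string is counted
s.replace('', np.nan).count()  # 2: NaN is excluded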

Removing Rows
# keep only the rows without a born_in value, then drop the now-redundant column
df = df[df.born_in.isnull()]
df = df.drop('born_in', axis=1)
Finding Duplicates
# duplicated method creates a boolean mask for duplicates
dupes_by_name = df[df.duplicated('name')]

all_dupes = df[df.duplicated('name') | df.duplicated('name', keep='last')]

all_dupes = df[df.duplicated('name', keep=False)]

all_dupes = df[df.name.isin(dupes_by_name.name)]

for name, rows in df.groupby('name'):
  print(f'name: {name}, # rows: {len(rows)}')

pd.concat([g for _, g in df.groupby('name') if len(g) > 1])['name']

Each of these snippets is a way of finding the rows whose name appears more than once (the groupby loop prints a row count per name, which makes the duplicates easy to spot).
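
A toy illustration (made-up names) of why keep=False, or OR-ing the two masks, is needed to catch every occurrence:

toy = pd.DataFrame({'name': ['bob', 'alice', 'bob']})
toy[toy.duplicated('name')]              # only the second 'bob' (index 2)
toy[toy.duplicated('name', keep=False)]  # both 'bob' rows (indices 0 and 2)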

Sorting Data
df = pd.DataFrame({'name': ['zak', 'alice', 'bob', 'mike', 'bob', 'bob'],
                   'score': [4, 3, 5, 2, 3, 7]})
df.sort_values(['name', 'score'], ascending=[True, False])
name	score
1	alice	3
5	bob	7
2	bob	5
4	bob	3
3	mike	2
0	zak	4
Views vs. Copies
# raises a SettingWithCopyWarning: chained indexing may operate on a copy of the DataFrame
df['country'][709] = 'France'

# Instead, it should be done as this:
df.loc[709, 'country'] = 'France'
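
A minimal sketch (toy data) of why the chained version is unreliable: assigning through a filtered subset operates on a copy, so the original frame is not updated.

toy = pd.DataFrame({'country': ['Poland', 'France'], 'year': [1903, 1911]})
subset = toy[toy.year > 1905]
subset['country'][1] = 'Germany'   # SettingWithCopyWarning; toy is left unchanged
toy.loc[1, 'country'] = 'Germany'  # unambiguous: updates toy directly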
Removing Duplicates
df[df.duplicated('name', keep=False)].sort_values(by='name')[['name', 'country', 'year', 'category']]

Selects every row whose name is duplicated and sorts the result by name.
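
To actually remove them rather than just display them, drop_duplicates filters directly. A sketch, assuming the first entry per name/year pair should be kept:

# keep only the first row for each (name, year) pair
df_deduped = df.drop_duplicates(subset=['name', 'year'], keep='first')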

Dealing with Times and Dates
# errors='raise' (the default) raises an exception on any unparseable date
pd.to_datetime(df.date_of_birth, errors='raise')

for i, row in df.iterrows():
  try:
    pd.to_datetime(row.date_of_death, errors='raise')
  except ValueError:
    print(f"{row.date_of_death.ljust(30)}({row['name']}, {i})")

with_death_dates = df[df.date_of_death.notnull()]
bad_dates = pd.isnull(pd.to_datetime(with_death_dates.date_of_death, errors='coerce'))
with_death_dates[bad_dates][['category', 'date_of_death']]

df.date_of_death = pd.to_datetime(df.date_of_death, errors='coerce')
# extract birth years by building a pandas DatetimeIndex from date_of_birth
df['award_age'] = df.year - pd.DatetimeIndex(df.date_of_birth).year
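
A toy illustration of the two errors modes (made-up strings):

pd.to_datetime(pd.Series(['7 November 1867', 'not a date']), errors='coerce')
0   1867-11-07
1          NaT
dtype: datetime64[ns]
# with errors='raise', the second value would raise an exception instead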
Full Data-Cleaning Function
def clean_data(df):
  df = df.replace('', np.nan)
  # DataFrame containing the rows with born_in fields
  df_born_in = df[df.born_in.notnull()]
  df = df[df.born_in.isnull()]
  # drop 'born_in' column for no born_in DataFrame
  df = df.drop('born_in', axis=1)
  # drop the bad 1809 entry, dated before the first Nobel Prize (1901)
  df.drop(df[df.year == 1809].index, inplace=True)
  # drop rows listed under 'Marie Curie', which duplicate her entries under 'Marie Skłodowska-Curie'
  df = df[~(df.name == 'Marie Curie')]
  # index search and value manipulation by loc[]
  df.loc[(df.name == 'Marie Sk\u0142odowska-Curie') & (df.year == 1911), 'country'] = 'France'
  df = df[~((df.name == 'Sidney Altman') & (df.year == 1990))]
  # shuffle the rows so that drop_duplicates keeps a random entry, not always the first
  df = df.reindex(np.random.permutation(df.index))
  # drop rows that duplicate another row's (name, year) pair
  df = df.drop_duplicates(['name', 'year'])
  df = df.sort_index()
  # fill in void 'category' for 'Alexis Carrel'
  df.loc[df.name == 'Alexis Carrel', 'category'] = 'Physiology or Medicine'
  # fill in void 'gender'
  df.loc[df.name == 'Ragnar Granit', 'gender'] = 'male'
  df = df[df.gender.notnull()] # remove institutional prizes
  df.loc[df.name == 'Hiroshi Amano', 'date_of_birth'] = '11 September 1960'
  df.date_of_birth = pd.to_datetime(df.date_of_birth)
  df.date_of_death = pd.to_datetime(df.date_of_death, errors='coerce')
  df['award_age'] = df.year - pd.DatetimeIndex(df.date_of_birth).year
  return df, df_born_in # df_born_in will be processed later
Adding the born_in column
df_clean, df_born_in = clean_data(df)

# clean up born_in
df_born_in.name = df_born_in.name.str.replace(r'\*', '', regex=True)
df_born_in.name = df_born_in.name.str.strip()
# only keep the first entry of duplicates (keep='first')
df_born_in.drop_duplicates(subset=['name'], inplace=True)
df_born_in.set_index('name', inplace=True)

def get_born_in(name):
  """ prints born_in if present, and returns nan for born_in column for born_in.isnull() """
  try:
    born_in = df_born_in.loc[name]['born_in']
    # print(f'name: {name}, born in: {born_in}')
  # .loc raises a KeyError if the name isn't in df_born_in's index
  except KeyError:
    born_in = np.nan
  return born_in

df_wbi = df_clean.copy()
df_wbi['born_in'] = df_wbi['name'].apply(get_born_in)
Merging DataFrames
df_winners_bios = pd.read_json('data/scrapy_nwinners_minibio.json')
# merge on 'link', which still matches across both DataFrames (the name column has been string-manipulated)
df_clean_bios = pd.merge(df_wbi, df_winners_bios, how='outer', on='link')

# drop rows from the outer merge with no matching winner (null name), then remove duplicate link/year pairs
df_clean_bios = df_clean_bios[df_clean_bios.name.notnull()].drop_duplicates(subset=['link', 'year'])
df_clean_bios.drop('image_urls', axis=1, inplace=True)

df_clean_bios.to_json('data/nobel_winners_cleaned.json', orient='records', date_format='iso')
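
A toy sketch of the outer merge's behaviour (hypothetical column names): rows present on only one side get NaN in the other side's columns, which is why rows with a null name are dropped afterwards.

left = pd.DataFrame({'link': ['a', 'b'], 'name': ['Alice', 'Bob']})
right = pd.DataFrame({'link': ['b', 'c'], 'mini_bio': ['bio of Bob', 'bio of Carol']})
pd.merge(left, right, how='outer', on='link')
  link   name      mini_bio
0    a  Alice           NaN
1    b    Bob    bio of Bob
2    c    NaN  bio of Carol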
To_SQL
import sqlalchemy

engine = sqlalchemy.create_engine('sqlite:///data/nobel_winners_clean.db')
df_clean_bios.to_sql('winners', engine, if_exists='replace')

df_read_sql = pd.read_sql('winners', engine)