May 20, 2020

Pandas Tutorial 3

1) Drop Nulls of one column in Pandas
Find isnull in a column:
isnull(master["playerID"]).value_counts()

Output:
False    7520
True      241
Name: playerID, dtype: int64

Drop nulls of specific columns (dropna)
master_orig = master.copy()
master = master.dropna(subset=["playerID"])
master.shape

2) Drop Nulls of multiple columns in Pandas
how = 'all' # if all subset cols are nulls
how = 'any' # if any of the subset cols are nulls
df.dropna(subset=col_list, how='all')  # col_list is already a list; don't wrap it in another list
master = master.dropna(subset=["firstNHL", "lastNHL"], how="all")

3)

master1 = master[master["lastNHL"] >= 1980]
master1.shape # (4627, 31)

Vs

master1 = master.loc[master["lastNHL"] >= 1980]
master1.shape # (4627, 31)

But the latter (.loc) is preferred: it is more explicit and performs better on large data


4) filter columns
master.filter(columns_to_keep).head()
(or)
master = master.filter(regex="(playerID|pos|^birth)|(Name$)")


5) Find DF memory usage
df.memory_usage()

def mem_mib(df):
    """Print the total memory usage of *df* in MiB, to two decimal places.

    Note: the format spec must be inside the braces — f'{mem}.2f' would
    print the raw float followed by the literal text ".2f".
    """
    mem = df.memory_usage().sum() / (1024 * 1024)
    print(f'{mem:.2f} MiB')

    
mem_mib(master) # 0.39 MiB
mem_mib(master_orig) # 1.84 MiB

6) Categorical
# A string variable consisting of only a few different values. 
# Converting such a string variable to a categorical variable will save some memory.

def make_categorical(df, col_name):
    """Convert df[col_name] to the categorical dtype in place (saves memory).

    Uses plain column assignment with .astype("category") rather than
    df.loc[:, col_name] = pd.Categorical(...): in modern pandas, .loc
    assignment may write the values into the existing object-dtype column
    and silently keep the old dtype, so no memory would be saved.
    """
    df[col_name] = df[col_name].astype("category")

# to save memory
make_categorical(master, "pos")
make_categorical(master, "birthCountry")
make_categorical(master, "birthState")

7)
pd.read_pickle()

8) Joins
Default is an inner join; pass how= to change it, e.g.
pd.merge(df1, df2, how='left')

# We joining based on player id of both dfs
# If left df has PlayerId & right df has plrId
pd.merge(df1, df2, left_on='PlayerId', right_on='plrId')

# We joining based on player id of both DFs
# Say if left DF has player id as index
# Here resultant merge DF has index from right DF 
# left DF index (left_index) is not considered in merge dF
pd.merge(df1, df2, left_index=True, right_on='plrId')

# We joining based on player id of both dfs
# Say if right df has player id as index
# Here resultant merge DF has index from left DF 
# right DF index (right_index) is not considered in merge dF
pd.merge(df1, df2, left_on='PlayerId', right_index=True)

# We can even set DF index (set_index) and use
# left_index and right_index 
pd.merge(df1, df2.set_index("playerID", drop=True),
                            left_index=True, right_index=True).head()

# Indicator
# It creates additional column _merge
# It indicates both, left_only, right_only
merged = pd.merge(master2, scoring, left_index=True,
                  right_on="playerID", how="right", indicator=True)

merged["_merge"].value_counts()
both          28579
right_only       37
left_only         0
Name: _merge, dtype: int64

# Filter only right_only
merged[merged["_merge"] == "right_only"].head()

# Filter only right_only or left_only
merged[(merged["_merge"] == "right_only") | (merged["_merge"] == "left_only")].sample(3)
or
merged[merged["_merge"].str.endswith("only")].sample(5)

# Validate the merge is one-to-many (1:m): left keys unique, right keys may repeat.
# If the relationship does not hold, pandas raises MergeError.
try:
    pd.merge(df1, df2, left_index=True, right_on='plrId', validate="1:m").head()
except Exception as e:
    pass  # MergeError: merge keys are not one-to-many


8) Drop random records
df.drop(df.sample(5).index)
-------
9) Longer to Wider format (pivot)
df  # (display the DataFrame; .show() is Spark, not pandas — use df or df.head())

playerID year Goals
10320 hlavaja01 2001 7.0
10322 hlavaja01 2002 1.0
10324 hlavaja01 2003 5.0
15873 markoan01 2001 5.0
15874 markoan01 2002 13.0
15875 markoan01 2003 6.0
18899 nylanmi01 2001 15.0
18900 nylanmi01 2002 0.0
18902 nylanmi01 2003 0.0

# Longer to Wider format conversion
pivot = df.pivot(index="playerID", columns="year", values="Goals")
year 2001 2002 2003
playerID
hlavaja01 7.0 1.0 5.0
markoan01 5.0 13.0 6.0
nylanmi01 15.0 0.0 0.0

pivot = pivot.reset_index()
pivot.columns.name = None
pivot

playerID 2001 2002 2003
0 hlavaja01 7.0 1.0 5.0
1 markoan01 5.0 13.0 6.0
2 nylanmi01 15.0 0.0 0.0

10) Wide to Long format (melt)
# melt()
# Pandas melt() function is used to change the DataFrame format from wide to long.
pivot.melt(id_vars="playerID", var_name="year", value_name="goals")
playerID year goals
0 hlavaja01 2001 7.0
1 markoan01 2001 5.0
2 nylanmi01 2001 15.0
3 hlavaja01 2002 1.0
4 markoan01 2002 13.0
5 nylanmi01 2002 0.0
6 hlavaja01 2003 5.0
7 markoan01 2003 6.0
8 nylanmi01 2003 0.0

-------
Pandas Multi-level Index

1) How to set multi-index
mi = df.set_index(['playerID', 'year'])
mi.head()

2) List multi-index values 
mi.index
MultiIndex([('aaltoan01', 1997),
            ('aaltoan01', 1998),
            ('zyuzian01', 2005),
            ('zyuzian01', 2006),
            ('zyuzian01', 2007)],
           names=['playerID', 'year'], length=28616)

3) len(mi.index.levels) # 2

4) mi.index.levels[0]
Index(['aaltoan01', 'abdelju01', 'abidra01', 'abrahth01', 'actonke01',
       'adamlu01', 'adamru01'], dtype='object', name='playerID', length=4627)

5) mi.index.levels[1]
Int64Index([1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990],
           dtype='int64', name='year')

6) mi.groupby(level="year")['G'].max().head()
year
1980    68.0
1981    92.0
1982    71.0
1983    87.0
1984    73.0
Name: G, dtype: float64

7) idxmax (gives the index of the max value)
mi.groupby(level="year")['G'].idxmax().head()
year
1980    (bossymi01, 1980)
1981    (gretzwa01, 1981)
1982    (gretzwa01, 1982)
1983    (gretzwa01, 1983)
1984    (gretzwa01, 1984)
Name: G, dtype: object

8) Filter based on above
mi.loc[mi.groupby(level="year")['G'].idxmax()].head()

firstName lastName pos Year Mon Day Country State City tmID GP G A Pts SOG
playerID year
bossymi01 1980 Mike Bossy R 1957.0 1.0 22.0 Canada QC Montreal NYI 79.0 68.0 51.0 119.0 315.0
gretzwa01 1981 Wayne Gretzky C 1961.0 1.0 26.0 Canada ON Brantford EDM 80.0 92.0 120.0 212.0 369.0
1982 Wayne Gretzky C 1961.0 1.0 26.0 Canada ON Brantford EDM 80.0 71.0 125.0 196.0 348.0
1983 Wayne Gretzky C 1961.0 1.0 26.0 Canada ON Brantford EDM 74.0 87.0 118.0 205.0 324.0
1984 Wayne Gretzky C 1961.0 1.0 26.0 Canada ON Brantford EDM 80.0 73.0 135.0 208.0 358.0

No comments:

Post a Comment