Jan 10, 2020

Pandas Dataframes

# Info - lists all of our columns, also gives data types
df.info()

# Set options
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)

# head
df.head(10)

# tail
df.tail(10)

# List all columns
df.columns

# Dataframe shape
print(df1.shape)

# Series - is a list / a column in a data frame
# A data frame is a container for multiple Series (columns)
people = {
              'first' : ['John', 'Mary', 'Linda'],
              'last' : ['Doe', 'Lee', 'Doe'],
              'email' : ['johndoe@gmail.com', 'marylee@gmail.com', 'lindadoe@gmail.com']
 }
df = pd.DataFrame(people)
type(df['email'])  # gives pandas.core.series.Series
df['email']  # gives list of emails
df.email  # gives same as above

# pd.DataFrame.from_records (build a DataFrame from a list of tuples)
records = [('John', 30, 'Bangalore'), ('Doe', 25, 'Chennai')]
df = pd.DataFrame.from_records(records, columns=['Name', 'Age', 'City'])
print(df.head(10))

df['email'] Vs df.email 
  • df['email'] is better to use over df.email
  • shape is an attribute of data frame, suppose you have a column name as shape in your DF
  • df['shape'] will give you result of shape column list 
  • df.shape will not work properly as you expect

# iloc Vs loc (Integer Location Vs Location)
# iloc (Search by position)
    df.iloc[0] # return a Series object for 0th index, access row with iloc
    df.iloc[ [0, 1] ] # returns first two rows of data, returns a Data Frame
    df.iloc[ [0, 1],  2] # returns email (2nd column) of first two rows
# loc (Search by label) (row_indexer, column_indexer)
    df.loc[0] # Same as iloc, returns Series object
    df.loc[ [ 0, 1] ] # Same as iloc, returns data frame object
    df.loc[ [ 0, 1 ], 'email' ]  # Instead of index, you can use column name
    df.loc[ [ 0, 1 ], ['email', 'last'] ] # Filter by multiple column names for first two rows
    df.loc[ 0, 'Hobby']  # Value of Hobby column for first row
    df.loc[ 0:5, 'Hobby':'Employment']  # Values of Hobby to Employment columns for first six rows, label slicing is inclusive   # see df.columns to get all the columns
    
df.loc[ df['email'] == 'abc@gmail.com',  : ]  # Rows of all with specific email (all columns)

# Get row by index
       Name  Age PinCode
0    Alex   10     500
1     Bob   12     600
2  Clarke   13     589

df.loc[2] # Gets object of Clarke by index
df1 = df.set_index('Name')  # Set Name as index, inplace=True is also there
print(df1)

             Age PinCode
Name
Alex     10     500
Bob      12     600
Clarke   13     589

df1.loc['Clarke']  # You can get the row by passing Name, since Name is the index


# Count occurrences of each value in a column (e.g. gender)
df["gender"].value_counts()
Male 100
Female 200

# Unique
ll = ['John', 'Doe', 'Tom', 'Doe']
df1 = pd.Series(ll)
df2 = pd.unique(df1)
print(type(df2)) #<class 'numpy.ndarray'>
print(df2)         #['John' 'Doe' 'Tom']

# Filter
df1_link_keys = df1[df1.link_key != '']
df1_no_link_keys = df1[df1.link_key == '']

# Filter multiple/specific columns
df1 = df[['first', 'last', 'email']]  # Gives DF in return

# Filter isin
df1_subset = df1[df1.index.isin([100, 200])]  # isin expects a list-like, not separate arguments

# Replace something with another
df1 = df1.replace('NULL', '')
df1 = df1.fillna('')

# Drop index
df1 = df1.drop(matched_index_list)
df1 = df1.drop(100)

# Column to list
df1_names_list = df1['name'].tolist() # column values to list
df1_index_list = df1.index.values.tolist() # index values to list

# Drop duplicates
df1.drop_duplicates(keep='last', inplace=True)  # Keep the last occurrence

# Drop duplicates based on specific columns
df1.drop_duplicates(subset=['id', 'name', 'city', 'state'], keep='last', inplace=True)
df1.drop_duplicates(subset=['id', 'name', 'city', 'state'], keep='first', inplace=True)

df1 = df1.drop_duplicates(subset=['id', 'name', 'city', 'state'], keep='last')  # inplace is not true
df1 = df1.drop_duplicates(subset=['id', 'name', 'city', 'state'], keep='first') # inplace is not true

# Drop duplicates with state is empty 
# if only one record with state as empty, keep that
# If two same rows with one state is there & other state is empty, filter one with empty
s_maxes = df1.groupby(['id', 'name', 'city']).state.transform('max')
df1 = df1.loc[df1.state == s_maxes]

# Dataframe To Table
table_dtype = {
                'name': VARCHAR(),
                'city': VARCHAR(),
                'state': VARCHAR(),
                'pincode': INTEGER()
              }
df1.to_sql(con=self.db_conn, name=table_name, dtype=table_dtype,
           if_exists='replace', index=False)

# Dataframe To JSON
df.to_json('data_frame.json', orient='table')

# From Table to Dataframe
raw_data_sql = f""" select id, name, city, state
from event_raw_data
                """
df1 = pd.read_sql(raw_data_sql, con=self.db_conn, index_col="id")

# Loop Dataframe
for index, row in df1.iterrows():
    print(index, row['name'], row['city'], row['state'])  # use row['name']: row.name returns the index label, not the 'name' column

# How to subtract rows of one pandas data frame from another?
# You want to achieve df1 - df2
df1 = df1[~df1.index.isin(df2.index)]

# Read CSV
df1 = pd.read_csv('data/test.csv')
df1.shape  # (10, 5)
















No comments:

Post a Comment