# Info - lists all columns with their non-null counts and dtypes
df.info()
# Display options: show up to 85 columns/rows before pandas truncates output
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)
# head - first 10 rows
df.head(10)
# tail - last 10 rows
df.tail(10)
# List all columns
df.columns
# Dataframe shape - (rows, columns) tuple; note shape is an attribute, not a method
print(df1.shape)
# A Series is a single column; a DataFrame is a container holding
# multiple Series that share one index.
people = dict(
    first=['John', 'Mary', 'Linda'],
    last=['Doe', 'Lee', 'Doe'],
    email=['johndoe@gmail.com', 'marylee@gmail.com', 'lindadoe@gmail.com'],
)
df = pd.DataFrame(people)
# Selecting one column yields a pandas.core.series.Series
type(df['email'])
df['email']  # bracket access - the email column as a Series
df.email     # attribute access - same Series as above
# Build a DataFrame from a list of record tuples with DataFrame.from_records.
# (Renamed the variable: the original used 'list', shadowing the builtin.)
records = [('John', 30, 'Bangalore'), ('Doe', 25, 'Chennai')]
df = pd.DataFrame.from_records(records, columns=['Name', 'Age', 'City'])
print(df.head(10))
# df['email'] vs df.email:
# - prefer df['email'] over df.email
# - 'shape' is a DataFrame attribute; suppose your DataFrame has a column named 'shape':
#   df['shape'] returns that column as expected,
#   but df.shape still returns the (rows, columns) attribute - not the column
# iloc vs loc (integer position vs label)
# iloc - search by integer position
df.iloc[0] # Series for the row at position 0
df.iloc[ [0, 1] ] # first two rows of data -> DataFrame
df.iloc[ [0, 1], 2] # column at position 2 (email) of the first two rows
# loc - search by label: df.loc[row_indexer, column_indexer]
df.loc[0] # Series for the row labelled 0
df.loc[ [ 0, 1] ] # rows labelled 0 and 1 -> DataFrame
df.loc[ [ 0, 1 ], 'email' ] # column selected by name instead of position
df.loc[ [ 0, 1 ], ['email', 'last'] ] # multiple columns for first two rows (fixed: column is 'last', not 'last_name')
df.loc[ 0, 'Hobby'] # Value of Hobby column for first row
# Label slices are written 'start':'stop' (both INCLUSIVE), not one 'a:b' string
df.loc[ 0:5, 'Hobby':'Employment'] # Hobby through Employment columns for first six rows # see df.columns to get all the columns
df.loc[ df['email'] == 'abc@gmail.com', : ] # boolean mask: rows with a specific email (all columns)
# Get row by index
# Sample data:
#    Name    Age  PinCode
# 0  Alex    10   500
# 1  Bob     12   600
# 2  Clarke  13   589
df.loc[2] # row with index label 2 (Clarke) as a Series
df1 = df.set_index('Name') # use the Name column as the index (inplace=True is also available)
print(df1)
# Output:
#         Age  PinCode
# Name
# Alex    10   500
# Bob     12   600
# Clarke  13   589
df1.loc['Clarke'] # Name is now the index, so rows can be looked up by name label
# Count occurrences of each unique value in a column (e.g. Yes/No)
df["gender"].value_counts()
# Male      100
# Female    200
# Unique - de-duplicate a Series, keeping first-seen order
ll = ['John', 'Doe', 'Tom', 'Doe']
df1 = pd.Series(ll)
df2 = df1.unique()  # Series.unique() is equivalent to pd.unique(df1)
print(type(df2)) #<class 'numpy.ndarray'>
print(df2) #['John' 'Doe' 'Tom']
# Filter rows with a boolean condition on a column
df1_link_keys = df1[df1.link_key != ''] # rows where link_key is non-empty
df1_no_link_keys = df1[df1.link_key == ''] # rows where link_key is empty
# Select a subset of columns
df1 = df[['first', 'last', 'email']] # Gives DF in return
# Filter rows whose index label is in a collection.
# Index.isin takes a single iterable; the original isin(100, 200)
# passes two positional arguments and raises TypeError.
df1_subset = df1[df1.index.isin([100, 200])]
# Replace one value with another across the whole frame
df1 = df1.replace('NULL', '') # swap literal 'NULL' strings for empty string
df1 = df1.fillna('') # fill NaN/None with empty string
# Drop rows by index label(s)
df1 = df1.drop(matched_index_list) # drop several labels at once
df1 = df1.drop(100) # drop a single label
# Column to list
df1_names_list = df1['name'].tolist() # column values to list
df1_index_list = df1.index.values.tolist() # index values to list
# Drop duplicates
df1.drop_duplicates(keep='last', inplace=True) # Keep the last occurrence
# Drop duplicates based on specific columns only
df1.drop_duplicates(subset=['id', 'name', 'city', 'state'], keep='last', inplace=True)
df1.drop_duplicates(subset=['id', 'name', 'city', 'state'], keep='first', inplace=True)
df1 = df1.drop_duplicates(subset=['id', 'name', 'city', 'state'], keep='last') # inplace is not true
df1 = df1.drop_duplicates(subset=['id', 'name', 'city', 'state'], keep='first') # inplace is not true
# Drop duplicates where state is empty:
# if only one record exists with state empty, keep it;
# if the same row exists both with a state and with an empty state, drop the empty one
s_maxes = df1.groupby(['id', 'name', 'city']).state.transform(max) # max picks the non-empty state ('' sorts lowest)
df1 = df1.loc[df1.state == s_maxes]
# Dataframe To Table (dtype maps column names to SQLAlchemy column types;
# VARCHAR/INTEGER presumably imported from sqlalchemy elsewhere - confirm)
table_dtype = {
'name': VARCHAR(),
'city': VARCHAR(),
'state': VARCHAR(),
'pincode': INTEGER()
}
# Write df1 into table_name, replacing the table if it exists; don't store the index
df1.to_sql(con=self.db_conn, name=table_name, dtype=table_dtype,
if_exists='replace', index=False)
# Dataframe To JSON ('table' orient writes a schema block alongside the data)
df.to_json('data_frame.json', orient='table')
# From Table to Dataframe
# NOTE(review): the f prefix is unnecessary here - no placeholders in the string
raw_data_sql = f""" select id, name, city, state
from event_raw_data
"""
df1 = pd.read_sql(raw_data_sql, con=self.db_conn, index_col="id") # id column becomes the index
# Loop over DataFrame rows as (index_label, row_Series) pairs
for index, row in df1.iterrows():
    # Use row['name'], NOT row.name: Series.name is the row's index label,
    # which shadows a column called 'name'.
    print(index, row['name'], row['city'], row['state'])
# How to subtract rows of one pandas data frame from another?
# You want to achieve df1 - df2
df1 = df1[~df1.index.isin(df2.index)] # keep rows whose index label is NOT in df2's index
# Read CSV
df1 = pd.read_csv('data/test.csv')
df1.shape # (10, 5)