Jan 10, 2020

Pandas Dataframes

# Info - lists all of our columns, also gives data types
df.info()

# Set options
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)

# head
df.head(10)

# tail
df.tail(10)

# List all columns
df.columns

# Dataframe shape
print(df1.shape)

# Series - is a list / a column in a data frame
# A data frame is a container for multiple Series (columns)
people = {
              'first' : ['John', 'Mary', 'Linda'],
              'last' : ['Doe', 'Lee', 'Doe'],
              'email' : ['johndoe@gmail.com', 'marylee@gmail.com', 'lindadoe@gmail.com']
 }
df = pd.DataFrame(people)
type(df['email'])  # gives pandas.core.series.Series
df['email']  # gives list of emails
df.email  # gives same as above

# pd.DataFrame.from_records (build a DataFrame from a list of tuples)
records = [('John', 30, 'Bangalore'), ('Doe', 25, 'Chennai')]
df = pd.DataFrame.from_records(records, columns=['Name', 'Age', 'City'])
print(df.head(10))

df['email'] Vs df.email 
  • df['email'] is better to use over df.email
  • shape is an attribute of data frame, suppose you have a column name as shape in your DF
  • df['shape'] will give you result of shape column list 
  • df.shape will not work properly as you expect

# iloc Vs loc (Integer Location Vs Location)
# iloc (Search by position)
    df.iloc[0] # return a Series object for 0th index, access row with iloc
    df.iloc[ [0, 1] ] # returns first two rows of data, returns a Data Frame
    df.iloc[ [0, 1],  2] # returns email (2nd column) of first two rows
# loc (Search by label) (row_indexer, column_indexer)
    df.loc[0] # Same as iloc, returns Series object
    df.loc[ [ 0, 1] ] # Same as iloc, returns data frame object
    df.loc[ [ 0, 1 ], 'email' ]  # Instead of index, you can use column name
    df.loc[ [ 0, 1 ], ['email', 'last'] ] # Filter by multiple column names for first two rows
    df.loc[ 0, 'Hobby']  # Value of Hobby column for first row
    df.loc[ 0:5, 'Hobby':'Employment']  # Values of Hobby to Employment columns for first six rows, label slicing is inclusive   # see df.columns to get all the columns
    
df.loc[ df['email'] == 'abc@gmail.com',  : ]  # Rows of all with specific email (all columns)

# Get row by index
       Name  Age PinCode
0    Alex   10     500
1     Bob   12     600
2  Clarke   13     589

df.loc[2] # Gets object of Clarke by index
df1 = df.set_index('Name')  # Set Name as index, inplace=True is also there
print(df1)

             Age PinCode
Name
Alex     10     500
Bob      12     600
Clarke   13     589

df1.loc['Clarke']  # You can get the row by passing Name, since Name is the index


# Count occurrences of each value in a column (e.g. gender)
df["gender"].value_counts()
Male 100
Female 200

# Unique
ll = ['John', 'Doe', 'Tom', 'Doe']
df1 = pd.Series(ll)
df2 = pd.unique(df1)
print(type(df2)) #<class 'numpy.ndarray'>
print(df2)         #['John' 'Doe' 'Tom']

# Filter
df1_link_keys = df1[df1.link_key != '']
df1_no_link_keys = df1[df1.link_key == '']

# Filter multiple/specific columns
df1 = df[['first', 'last', 'email']]  # Gives DF in return

# Filter isin
df1_subset = df1[df1.index.isin([100, 200])]  # isin expects a list-like, not separate arguments

# Replace something with another
df1 = df1.replace('NULL', '')
df1 = df1.fillna('')

# Drop index
df1 = df1.drop(matched_index_list)
df1 = df1.drop(100)

# Column to list
df1_names_list = df1['name'].tolist() # column values to list
df1_index_list = df1.index.values.tolist() # index values to list

# Drop duplicates
df1.drop_duplicates(keep='last', inplace=True)  # Keep the last occurrence

# Drop duplicates based on specific columns
df1.drop_duplicates(subset=['id', 'name', 'city', 'state'], keep='last', inplace=True)
df1.drop_duplicates(subset=['id', 'name', 'city', 'state'], keep='first', inplace=True)

df1 = df1.drop_duplicates(subset=['id', 'name', 'city', 'state'], keep='last')  # inplace is not true
df1 = df1.drop_duplicates(subset=['id', 'name', 'city', 'state'], keep='first') # inplace is not true

# Drop duplicates with state is empty 
# if only one record with state as empty, keep that
# If two same rows with one state is there & other state is empty, filter one with empty
s_maxes = df1.groupby(['id', 'name', 'city']).state.transform('max')
df1 = df1.loc[df1.state == s_maxes]

# Dataframe To Table
table_dtype = {
                'name': VARCHAR(),
                'city': VARCHAR(),
                'state': VARCHAR(),
                'pincode': INTEGER()
              }
df1.to_sql(con=self.db_conn, name=table_name, dtype=table_dtype,
           if_exists='replace', index=False)

# Dataframe To JSON
df.to_json('data_frame.json', orient='table')

# From Table to Dataframe
raw_data_sql = f""" select id, name, city, state
from event_raw_data
                """
df1 = pd.read_sql(raw_data_sql, con=self.db_conn, index_col="id")

# Loop Dataframe
for index, row in df1.iterrows():
    print(index, row['name'], row['city'], row['state'])  # use row['name']: row.name returns the index label, not the 'name' column

# How to subtract rows of one pandas data frame from another?
# You want to achieve df1 - df2
df1 = df1[~df1.index.isin(df2.index)]

# Read CSV
df1 = pd.read_csv('data/test.csv')
df1.shape  # (10, 5)
















No comments:

Post a Comment