Python Pandas – 02 – DataFrame – A quick reference


A Pandas DataFrame is a table with rows and columns:
a group of Pandas Series objects that share a common index.

Operations:

  1. Create DF -> pd.DataFrame(data=ndArray/tuple/Dict => Iterable, index= array-like, columns=array-like)
  2. Grab one/many columns – myDF['colName'] (one column, a Series) or myDF[['colA', 'colB']] (several columns, a DataFrame)
  3. Grab one/many rows – myDF.iloc[0] (by position) or myDF.loc['IndexName'] (by index label)
  4. Insert a new column – myDF['newColumnName'] = myDF['someOldColumn'] / 100.0
  5. Insert a new row – pd.concat([myDF, newRowSet]) (myDF.append(newRowSet) was removed in pandas 2.0)
  6. Aabhar (credits): Jose Portilla (Head of Data Science at Pierian Training) @Udemy
import numpy as np
import pandas as pd

# ########## CREATING A DATAFRAME
# Seeding the generator makes every run draw the same "random" integers.
np.random.seed(101)
# 4 rows x 3 columns of integers from 0 to 100 (the upper bound 101 is exclusive).
myData = np.random.randint(low=0, high=101, size=(4, 3))
print(myData)
""" RES ->
[[95 11 81]
 [70 63 87]
 [75  9 77]
 [40  4 63]]
"""
# With no labels supplied, rows and columns are both numbered 0, 1, 2 ...
myDf = pd.DataFrame(myData)
print(myDf)
""" RES -> 
    0   1   2
0  95  11  81
1  70  63  87
2  75   9  77
3  40   4  63
"""
# The same data again, now with explicit row (index) and column labels.
fruitLabels = ["Apple", "Berry", "Cherry", "Dates"]
monthLabels = ["Jan", "Feb", "Mar"]
myDf = pd.DataFrame(myData, index=fruitLabels, columns=monthLabels)
print(myDf)
""" RES -> 
        Jan  Feb  Mar
Apple    95   11   81
Berry    70   63   87
Cherry   75    9   77
Dates    40    4   63
"""

# ########## LOADING A DATAFRAME FROM A CSV FILE
# The Windows path is written as a raw string so that every backslash is taken
# literally; a normal string would need each separator doubled, and a forgotten
# one (such as "\m") is an invalid escape sequence.
myDf = pd.read_csv(filepath_or_buffer=r'D:\DataScienceLearning\PythonPrograms\RK_PGMS\myData.csv')
print(myDf.columns)
# RES -> Index(['name', 'region', 'numberrange', 'currency', 'country'], dtype='object')
print(myDf.index)
# RES -> RangeIndex(start=0, stop=40, step=1)
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
myDf.info()
""" RES -> Describes each column (i.e. 5) and the total number of rows (i.e. 40)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         40 non-null     object
 1   region       40 non-null     object
 2   numberrange  40 non-null     int64 
 3   currency     40 non-null     object
 4   country      40 non-null     object
dtypes: int64(1), object(4)
memory usage: 1.7+ KB
"""
print()
# In a Jupyter notebook, try evaluating the expression without "print":
# the notebook renders the DataFrame as a nicely formatted table.
print(myDf.head(2)) # Without an argument head() returns the first 5 entries
""" RES -> 
           name          region  numberrange currency         country
0   Mufutau Moon      Quảng Bình            6  $99.10   United Kingdom
1  Harrison Bass  Stockholms län            9  $34.75           Mexico
"""

print(myDf.tail(3)) # Without an argument tail() returns the last 5 entries
""" RES -> 
                name          region  numberrange currency      country
37          Otto Ray           Bihar            3   $9.07      Colombia
38  Blake Fitzgerald    South Island            4  $15.79        Mexico
39    Clarke Harrell  Jönköpings län            5  $45.20   New Zealand
"""

# describe() computes summary statistics; by default only the numeric columns
# are included, which here is just 'numberrange'.
print(myDf.describe())

""" RES -> 
       numberrange
count    40.000000
mean      4.950000
std       2.630687
min       0.000000
25%       3.000000
50%       5.000000
75%       7.000000
max      10.000000
"""
# Transposing puts one described column per row, which reads better when
# there are many numeric columns.
print(myDf.describe().transpose())
""" RES -> 
            count  mean       std  min  25%  50%  75%   max
numberrange   40.0  4.95  2.630687  0.0  3.0  5.0  7.0  10.0
"""

# ########## WORKING WITH COLUMNS

# Selecting a single column by name gives back a Pandas Series.
nameColumn = myDf['name']
rangeColumn = myDf['numberrange']
print(type(nameColumn), "---", type(rangeColumn))
# RES -> <class 'pandas.core.series.Series'> --- <class 'pandas.core.series.Series'>

# Selecting several columns: pass a *list* of names, hence the double
# square brackets when written inline - myDf[['name', 'numberrange']].
wantedColumns = ['name', 'numberrange']
print(myDf[wantedColumns].head(3))
""" RES -> 
            name  numberrange
0   Mufutau Moon            6
1  Harrison Bass            9
2       Leo Cruz           10
"""

# Assigning to a column name that does not exist yet adds that column.
myDf['XX'] = rangeColumn * 2
print(myDf.head(2))
""" RES -> 
            name          region  numberrange currency         country  XX
0   Mufutau Moon      Quảng Bình            6  $99.10   United Kingdom  12
1  Harrison Bass  Stockholms län            9  $34.75           Mexico  18
"""

# Dropping a column: drop() returns a new DataFrame and leaves myDf untouched
# unless inplace=True is passed.
myDf2 = myDf.drop(labels='XX', axis='columns')
print(myDf2.head(2))
""" RES -> 
            name          region  numberrange currency         country
0   Mufutau Moon      Quảng Bình            6  $99.10   United Kingdom
1  Harrison Bass  Stockholms län            9  $34.75           Mexico
"""

# Removing rows - pass the index labels of the rows to drop (axis=0 means rows).
# inplace=False returns a new DataFrame, so myDf itself keeps all its rows.
myDf3 = myDf.drop([36,37], axis=0, inplace=False)
print(myDf3.tail(4))
""" RES -> 
                name          region  numberrange currency         country  XX
34       Illana Peck        Sardegna            5  $28.04           Norway  10
35       Ima Hawkins       Querétaro            9  $45.63   United Kingdom  18
38  Blake Fitzgerald    South Island            4  $15.79           Mexico   8
39    Clarke Harrell  Jönköpings län            5  $45.20      New Zealand  10   
"""

# Using an existing column as the index instead of the default 0,1,2 ...
myDf3.set_index('XX', inplace=True)
print(myDf3.head(2))
""" RES ->  Look XX became the index now
             name          region  numberrange currency         country
XX                                                                     
12   Mufutau Moon      Quảng Bình            6  $99.10   United Kingdom
18  Harrison Bass  Stockholms län            9  $34.75           Mexico
"""
# reset_index() returns a NEW DataFrame (it does not modify myDf3 in place), so
# the result has to be assigned back. 'XX' becomes an ordinary column again and
# the rows get a fresh 0,1,2 ... index.
myDf3 = myDf3.reset_index()
print()
# Accessing particular rows - use iloc (by position) or loc (by index label)
# Note: a single position or label, e.g. myDf.iloc[5], returns a Series;
# a slice or a list, as used below, returns a DataFrame.
print(myDf.iloc[5:8])
""" RES -> 
             name        region  numberrange currency  country  XX
5       Zelda Gay      Connacht            4   $7.75     Italy   8
6  Nichole Oliver  Penza Oblast            5  $62.07   Nigeria  10
7    Anika Haynes      Los Ríos            6  $44.45    France  12
"""
# Let us set a new index: the 'name' column. inplace=False returns a new
# DataFrame and leaves myDf with its 0,1,2 ... index.
myDfX = myDf.set_index('name', inplace=False)
# Select a particular row by its new index value. The label is passed inside
# a list, so the result is a one-row DataFrame rather than a Series.
print(myDfX.loc[['Mufutau Moon']])
""" RES -> 
                  region  numberrange currency         country  XX
name                                                              
Mufutau Moon  Quảng Bình            6  $99.10   United Kingdom  12
"""

# Selected rows with selected columns: loc[ <list of row labels>, <list of column names> ]
# Check the closing brackets carefully - each list has its own pair.
print(myDfX.loc[ ['Mufutau Moon', 'Cora Newton'], ['country', 'currency'] ])
""" RES -> 
                     country currency
name                                 
Mufutau Moon  United Kingdom  $99.10 
Cora Newton       Costa Rica  $66.40 
"""
# Append a new row.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0. Its
# replacement is the top-level function pd.concat - there is no DataFrame.concat
# method, so it must be called as pd.concat([...]), not myDf.concat().
oneRowAsSeries = myDf.iloc[3]   # a single position gives a Series
oneRowDf = myDf.iloc[[3]]       # a list of positions gives a one-row DataFrame (column dtypes kept)
print(myDf.count())             # non-null values per column before appending
# The appended row keeps its original index label (3); pass ignore_index=True
# to pd.concat if a fresh 0,1,2 ... index is wanted instead.
myNewDf = pd.concat([myDf, oneRowDf])
print(myNewDf.count())
""" -> RES 
name           41
region         41
numberrange    41
currency       41
country        41
XX             41
dtype: int64
"""
# Conditional selection: build a True/False Series (one entry per row) and use
# it to keep only the rows where it is True.
remainderBySix = myNewDf['numberrange'] % 6
boolSeries = remainderBySix == 0

print(myNewDf.loc[boolSeries])
""" RES ->
            name         region  numberrange currency         country  XX
0   Mufutau Moon     Quảng Bình            6  $99.10   United Kingdom  12
7   Anika Haynes       Los Ríos            6  $44.45           France  12
14  Ethan Powers   South Island            6  $30.46           Brazil  12
22    Abbot Bird   South Island            6  $71.36          Ukraine  12
27    Ivana Bell  Valle d'Aosta            0  $76.95            Chile   0
29   Phoebe Goff       Arkansas            6  $40.60       Costa Rica  12
"""
# A similar True/False Series can be built from a list of wanted values with
# Series.isin, for example:
#   myOptions = ['Mufutau Moon', 'Ivana Bell']
#   myNewDf[myNewDf['name'].isin(myOptions)]
Advertisement

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s