Python Pandas – 02 – DataFrame – A quick reference

A pandas DataFrame is a table with rows and columns.
It is a group of pandas Series objects that share a common index.

Operations:

  1. Create DF -> pd.DataFrame(data=ndArray/tuple/Dict => Iterable, index= array-like, columns=array-like)
  2. Grab one/many columns – myDF['colName'] (one column, a Series) or myDF[['col1', 'col2']] (many columns, a DataFrame)
  3. Grab one/many rows – myDF.iloc[0] or myDF.loc['IndexName']
  4. Insert a new column – myDF['newColumnName'] = myDF['someOldColumn'] / 100.0
  5. Insert a new row – pd.concat([myDF, newRowSet]) (the older myDF.append(newRowSet) was removed in pandas 2.0)
  6. Aabhar : Jose Portilla (Head of Data Science at Pierian Training) @Udemy
import numpy as np
import pandas as pd

# ---------- Creating a DataFrame ----------
# Seeding the generator makes every run draw the same "random" numbers.
np.random.seed(101)
myData = np.random.randint(low=0, high=101, size=(4, 3))
print(myData)
# RES ->
# [[95 11 81]
#  [70 63 87]
#  [75  9 77]
#  [40  4 63]]

# Without explicit labels, rows and columns are both numbered 0, 1, 2, ...
myDf = pd.DataFrame(myData)
print(myDf)
# RES ->
#     0   1   2
# 0  95  11  81
# 1  70  63  87
# 2  75   9  77
# 3  40   4  63

# Same data again, this time with row labels (index) and column labels.
myDf = pd.DataFrame(
    myData,
    index=["Apple", "Berry", "Cherry", "Dates"],
    columns=["Jan", "Feb", "Mar"],
)
print(myDf)
# RES ->
#         Jan  Feb  Mar
# Apple    95   11   81
# Berry    70   63   87
# Cherry   75    9   77
# Dates    40    4   63

# ########## LOADING A CSV FILE AND INSPECTING THE DATAFRAME
# Raw string: every backslash of the Windows path is taken literally, so no
# sequence such as "\m" is parsed as an (invalid) escape.
myDf = pd.read_csv(filepath_or_buffer=r'D:\DataScienceLearning\PythonPrograms\RK_PGMS\myData.csv')
print(myDf.columns)
# RES -> Index(['name', 'region', 'numberrange', 'currency', 'country'], dtype='object')
print(myDf.index)
# RES -> RangeIndex(start=0, stop=40, step=1)

# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would only add a stray "None" line.
myDf.info()
""" RES -> Describes each column (5 of them) and the total number of rows (40)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         40 non-null     object
 1   region       40 non-null     object
 2   numberrange  40 non-null     int64 
 3   currency     40 non-null     object
 4   country      40 non-null     object
dtypes: int64(1), object(4)
memory usage: 1.7+ KB
"""
print()
# In a Jupyter notebook, try evaluating the expression without print():
# the DataFrame is then rendered as a nicely formatted table.
print(myDf.head(2)) # head() returns the first 5 rows by default
""" RES -> 
           name          region  numberrange currency         country
0   Mufutau Moon      Quảng Bình            6  $99.10   United Kingdom
1  Harrison Bass  Stockholms län            9  $34.75           Mexico
"""

print(myDf.tail(3)) # tail() returns the last 5 rows by default
""" RES -> 
                name          region  numberrange currency      country
37          Otto Ray           Bihar            3   $9.07      Colombia
38  Blake Fitzgerald    South Island            4  $15.79        Mexico
39    Clarke Harrell  Jönköpings län            5  $45.20   New Zealand
"""

# describe() summarises the numeric columns with basic statistics
print(myDf.describe())

""" RES -> 
       numberrange
count    40.000000
mean      4.950000
std       2.630687
min       0.000000
25%       3.000000
50%       5.000000
75%       7.000000
max      10.000000
"""
# Transposing the summary puts one row per column, which is easier to read
print(myDf.describe().transpose())
""" RES -> 
            count  mean       std  min  25%  50%  75%   max
numberrange   40.0  4.95  2.630687  0.0  3.0  5.0  7.0  10.0
"""

# ---------- Selecting, adding and dropping columns / rows ----------

# A single column taken out of a DataFrame is a pandas Series.
print(type(myDf['name']), "---", type(myDf['numberrange']))
# RES -> <class 'pandas.core.series.Series'> --- <class 'pandas.core.series.Series'>

# For more than one column, index with a *list* of column names -
# that is where the double square brackets myDf[[...]] come from.
print(myDf[['name', 'numberrange']].head(3))
# RES ->
#             name  numberrange
# 0   Mufutau Moon            6
# 1  Harrison Bass            9
# 2       Leo Cruz           10

# Assigning to a label that does not exist yet adds a new column.
myDf['XX'] = myDf['numberrange'] * 2
print(myDf.head(2))
# RES ->
#             name          region  numberrange currency         country  XX
# 0   Mufutau Moon      Quảng Bình            6  $99.10   United Kingdom  12
# 1  Harrison Bass  Stockholms län            9  $34.75           Mexico  18

# Dropping a column: drop() is not in-place by default, so it returns a new
# DataFrame and myDf keeps its 'XX' column (inplace=True would modify myDf).
myDf2 = myDf.drop(columns='XX')
print(myDf2.head(2))
# RES ->
#             name          region  numberrange currency         country
# 0   Mufutau Moon      Quảng Bình            6  $99.10   United Kingdom
# 1  Harrison Bass  Stockholms län            9  $34.75           Mexico

# Dropping rows: pass the index labels of the rows to remove.
myDf3 = myDf.drop(index=[36, 37])
print(myDf3.tail(4))
# RES ->
#                 name          region  numberrange currency         country  XX
# 34       Illana Peck        Sardegna            5  $28.04           Norway  10
# 35       Ima Hawkins       Querétaro            9  $45.63   United Kingdom  18
# 38  Blake Fitzgerald    South Island            4  $15.79           Mexico   8
# 39    Clarke Harrell  Jönköpings län            5  $45.20      New Zealand  10

# Using an existing column as the index instead of the default 0, 1, 2, ...
myDf3.set_index('XX', inplace=True)
print(myDf3.head(2))
""" RES ->  Look XX became the index now
             name          region  numberrange currency         country
XX                                                                     
12   Mufutau Moon      Quảng Bình            6  $99.10   United Kingdom
18  Harrison Bass  Stockholms län            9  $34.75           Mexico
"""
# reset_index() returns a NEW DataFrame (it is not in-place by default), so the
# result has to be assigned back. Afterwards the index is 0, 1, 2, ... again
# and 'XX' is an ordinary column once more.
myDf3 = myDf3.reset_index()
print()
# Accessing particular rows - use iloc (by position) or loc (by label)
# Note: a slice such as iloc[5:8] returns a DataFrame; only a single
# position such as iloc[5] returns a Series.
print(myDf.iloc[5:8])
""" RES -> 
             name        region  numberrange currency  country  XX
5       Zelda Gay      Connacht            4   $7.75     Italy   8
6  Nichole Oliver  Penza Oblast            5  $62.07   Nigeria  10
7    Anika Haynes      Los Ríos            6  $44.45    France  12
"""
# ---------- Label-based selection with loc ----------
# Use the 'name' column as the index; set_index() is not in-place by default,
# so myDf itself is left unchanged and the re-indexed copy is kept in myDfX.
myDfX = myDf.set_index('name')

# Pick one row by its index label. Passing the label inside a list keeps the
# result a DataFrame.
print(myDfX.loc[['Mufutau Moon']])
# RES ->
#                   region  numberrange currency         country  XX
# name
# Mufutau Moon  Quảng Bình            6  $99.10   United Kingdom  12

# Selected rows AND selected columns: loc[ [row labels], [column labels] ].
# Watch the brackets - two lists, separated by a comma.
print(
    myDfX.loc[
        ['Mufutau Moon', 'Cora Newton'],
        ['country', 'currency'],
    ]
)
# RES ->
#                      country currency
# name
# Mufutau Moon  United Kingdom  $99.10
# Cora Newton       Costa Rica  $66.40
# Append a new row.
# DataFrame.append() was deprecated and then removed in pandas 2.0;
# pd.concat() is the replacement and takes a list of DataFrames.
oneRowAsSeries = myDf.iloc[3]   # a single position gives a Series
oneRowDf = myDf.iloc[[3]]       # a list of positions gives a one-row DataFrame (column dtypes kept)
# Like append(), concat() keeps the original index labels by default, so the
# added row carries label 3 a second time (pass ignore_index=True to renumber).
myNewDf = pd.concat([myDf, oneRowDf])
print(myNewDf.count())
""" -> RES 
name           41
region         41
numberrange    41
currency       41
country        41
XX             41
dtype: int64
"""
# Conditional selection: build a boolean Series, then use it as a row filter.
# 0 also qualifies, because 0 % 6 == 0.
boolSeries = (myNewDf['numberrange'] % 6 == 0)

print(myNewDf[boolSeries])
""" RES ->
            name         region  numberrange currency         country  XX
0   Mufutau Moon     Quảng Bình            6  $99.10   United Kingdom  12
7   Anika Haynes       Los Ríos            6  $44.45           France  12
14  Ethan Powers   South Island            6  $30.46           Brazil  12
22    Abbot Bird   South Island            6  $71.36          Ukraine  12
27    Ivana Bell  Valle d'Aosta            0  $76.95            Chile   0
29   Phoebe Goff       Arkansas            6  $40.60       Costa Rica  12
"""
# Not run here - selecting rows whose 'name' is in a list of options:
#myOptions = ['Mufutau Moon', 'Ivana Bell']
#myNewDf[myNewDf['name'].isin(myOptions)]

Python Pandas – 01 – Series – A quick reference

  1. pandas is an open source, BSD-licensed library providing high-performance,
    easy-to-use data structures and data analysis tools for the Python programming language
  2. Support for the extremely powerful table i.e DATAFRAME system built off of NumPy
  3. Tools for reading/writing between many formats (can interact with HTML files and SQL databases too!)
  4. Intelligent grabbing of data based on the indexing/logic/subset etc.
  5. Handle missing data
  6. Adjust and restructure data structures
  7. Main Documentation Link : https://pandas.pydata.org/docs/
  8. Aabhar : Jose Portilla (Head of Data Science at Pierian Training) @Udemy
  9. SERIES -> 1 Dimensional ndarray with axis label
  10. A Series is a data structure in the pandas library that holds an array of information along with a named index
  11. How to install pandas -> pip install pandas
  12. In case of error – ModuleNotFoundError: No module named ‘pandas’, open jupyter, Terminal -> Run Terminal ->(Type) pip install pandas – Successfully installed pandas-1.5.3 pytz-2022.7.1 – after installation restart your Jupyter kernel
  13. myScoreSeries = pd.Series(data=[55, 35.0, 'SeventyFive'], index=['Sachin', 'Dhoni', 'Kohli'])
import numpy as np
import pandas as pd

# ############# PANDAS SERIES using the Series() constructor
#help(pd.Series)  # Upper case S
myIndex = ['Sachin', 'Dhoni', 'Kohli']
myData = [55,35,75]


mySeries = pd.Series(data=myData)
print(type(mySeries))   
# RES -> <class 'pandas.core.series.Series'>
print(mySeries)  # By default the index is 0, 1, 2, ...
""" RES -> 
0    55
1    35
2    75
dtype: int64
"""

# Same data, now with a named (string) index
mySeries = pd.Series(data=myData, index=myIndex)
print(mySeries)
""" RES -> 
Sachin    55
Dhoni     35
Kohli     75
dtype: int64
"""

# Access by POSITION must go through .iloc: with a string index, mySeries[0]
# relies on a positional fallback that pandas has deprecated.
print(mySeries.iloc[0])
# RES -> 55
# Access by LABEL uses plain square brackets (or .loc)
print(mySeries['Sachin'] , mySeries.shape)
# RES -> 55 (3,)   ---> one-dimensional, 3 elements

# ---------- Series built from a Python dictionary ----------
# The dictionary keys become the index labels, the values become the data.
myDict = {
    "India": "Best",
    "Australia": "Better",
}
mySer = pd.Series(data=myDict)
print(mySer)
# RES ->
# India          Best
# Australia    Better
# dtype: object

print(mySer.keys())  # keys() is a method: it returns the index
# RES -> Index(['India', 'Australia'], dtype='object')
print(mySer.values)  # values is an attribute - no parentheses
# RES -> ['Best' 'Better']

# ---------- Arithmetic on Series ----------
# Two quarters of sales keyed by country; only "India" occurs in both.
ser1 = {"India": 44, "Japan": 40, "USA": 65}
ser2 = {"India": 40, "Pak": 24, "Nepal": 20}

sales_q1, sales_q2 = pd.Series(ser1), pd.Series(ser2)

# With a plain list, "* 3" repeats the list ...
print([1, 2] * 3)
# RES -> [1, 2, 1, 2, 1, 2]

# ... whereas on a Series the operation is broadcast to every element.
print(sales_q1 * 2)
# RES ->
# India     88
# Japan     80
# USA      130
# dtype: int64

# "+" aligns the two Series on their labels; a label that is missing from
# either side yields NaN.
print(sales_q1 + sales_q2)
# RES ->
# India    84.0
# Japan     NaN
# Nepal     NaN
# Pak       NaN
# USA       NaN
# dtype: float64

# The add / sub / mul / div methods accept fill_value, which stands in for a
# label missing on one side - here a missing entry counts as 0.0.
print(sales_q1.add(sales_q2, fill_value=0.0))
# RES ->
# India    84.0
# Japan    40.0
# Nepal    20.0
# Pak      24.0
# USA      65.0
# dtype: float64

# Traversing a Series is not as straightforward as it looks and is left for
# another post; the naive attempt below is intentionally not run.
#for key in sales_q1:
#    print(sales_q1[key])

Python numpy – 02 – A quick reference

numpy – INDEXING AND SELECTION, OPERATIONS

  1. Grabbing one or more elements
  2. Broadcasting
  3. Grabbing of 2D array elements
  4. Conditional Selection of array elements
  5. 1D Array (+-*/) 1D Array,
  6. 2D Array -> Overall sum, rowwise sum, columnwise sum
  7. Aabhar : Jose Portilla (Head of Data Science at Pierian Training) @Udemy
import numpy as np

# ---------- Selecting array elements ----------
arr = np.arange(1, 11)
print(arr)
# RES -> [ 1  2  3  4  5  6  7  8  9 10]
print(arr[5])    # the element at index 5
# RES -> 6
print(arr[5:9])  # indices 5 to 8 - the stop index 9 is excluded
# RES -> [6 7 8 9]
print(arr[5:])   # index 5 up to the end
# RES -> [ 6  7  8  9 10]
print(arr[:5])   # from the start up to (not including) index 5
# RES -> [1 2 3 4 5]

# ---------- Broadcasting ----------
# Assigning one scalar to a slice writes it into every selected element;
# a plain Python list does not behave like this.
arr[:3] = 999
print(arr)
# RES -> [999 999 999   4   5   6   7   8   9  10]

# A slice shares its data with the array it was taken from, so writing
# through slicedArr also changes arr.
slicedArr = arr[:5]
print(slicedArr)
# RES -> [999 999 999   4   5]
slicedArr[:] = np.arange(300, 305)
print(slicedArr)
print(arr)
# RES -> [300 301 302 303 304]
#        [300 301 302 303 304   6   7   8   9  10]


# ---------- Keeping the original untouched: copy() ----------
# copy() creates an independent array; filling it leaves arr as it was.
sliceArr2 = arr.copy()
sliceArr2.fill(111)
print(sliceArr2)
print(arr)
# RES -> [111 111 111 111 111 111 111 111 111 111]
#        [300 301 302 303 304   6   7   8   9  10]


# ---------- 2D array: selection and indexing ----------
arr2d = np.array([
    [5, 10, 15],
    [20, 25, 30],
    [35, 40, 45],
])
print(arr2d)
# RES -> [[ 5 10 15]
#         [20 25 30]
#         [35 40 45]]
print(arr2d[0])         # the first row
# RES -> [ 5 10 15]
print(arr2d[0, 2])      # first row, last element
# RES -> 15
print(arr2d[:2])        # rows 0 and 1 - the third row is left out
# RES -> [[ 5 10 15]
#         [20 25 30]]
print(arr2d[:2, 1])     # from those two rows, only the column at index 1
# RES -> [10 25]
print(arr2d[:2, 1:])    # from those two rows, the columns from index 1 onwards
# RES -> [[10 15]
#         [25 30]]

# ---------- Conditional selection ----------
x = np.arange(1, 11)
# Comparing an array with a scalar gives an array of booleans, one per element.
boolX = x > 8
print(boolX)
# RES -> [False False False False False False False False  True  True]
# Indexing with that boolean array keeps only the elements marked True.
print(x[boolX])
# RES -> [ 9 10]
# The same thing in a single step, with the condition written inline.
print(x[x < 5])
# RES -> [1 2 3 4]

# ---------- Operations on a 1D array ----------
m = np.array([3, 3, 4])
print("m is : ", m)
# RES -> m is :  [3 3 4]
# Python built-ins first, then the equivalent array methods plus var / std.
print(sum(m), max(m), min(m), " ---", m.sum(), m.max(), m.min(), m.var(), m.std())
# RES -> 10 4 3  --- 10 4 3 0.22222222222222224 0.4714045207910317
# Arithmetic with a scalar or another array is applied element by element.
print(m + 5, m - m, m / 2)
# RES -> [8 8 9] [0 0 0] [1.5 1.5 2. ]
# numpy's math functions also work element by element.
print(np.sqrt(m), np.sin(m), np.log(m))
# RES -> [1.73205081 1.73205081 2.]
#        [ 0.14112001  0.14112001 -0.7568025 ]
#        [1.09861229 1.09861229 1.38629436]

# ---------- Operations on a 2D array ----------
m2d = np.arange(6).reshape((3, 2))
print(m2d)
# RES -> [[0 1]
#         [2 3]
#         [4 5]]
print("Overall Sum : ", m2d.sum())
# RES -> 15
# axis=0 adds up along the rows (downwards): one total per column.
print("Column-wise Sum : ", m2d.sum(axis=0))
# RES -> [6 9]
# axis=1 adds up along the columns (horizontally): one total per row.
print("Row-wise Sum : ", m2d.sum(axis=1))
# RES -> [1 5 9]

Python numpy – 01 – A quick reference

  1. “numpy” is the fundamental package for scientific computing in Python.
  2. Python library for creating N-Dimensional array
  3. Great support for Linear Algebra, Statistical Distributions, Trigonometry
  4. Aabhar : Jose Portilla (Head of Data Science at Pierian Training) @Udemy
  5. Creating an array in 3 ways, accessing via index, some major methods
  6. Broadcasting and how to prevent
  7. 1D np.array([1,2]) 2D np.array([ [1,2], [3,4] ]) 3D np.array([ [ [1,2], [3,4] ] ])
import numpy as np
# ---------- Three ways to create a numpy array ----------
# A) Convert a standard Python list
myList = [1, 2, 3]
myArray = np.array(myList)
print(type(myArray), "---", myArray)
# RES -> <class 'numpy.ndarray'> --- [1 2 3]

# B) Use the built-in constructor functions
print(np.arange(0, 11, 2))        # 0 to 10 in steps of 2 (the stop value 11 is excluded)
# RES -> [ 0  2  4  6  8 10]
print(np.linspace(0, 10, num=3))  # 3 evenly spaced numbers from 0 to 10
# RES -> [ 0.  5. 10.]
print(np.zeros(shape=(1, 5)))     # shape (1, 5): a 2D array with one row of 5 zeros
# RES -> [[0. 0. 0. 0. 0.]]
print(np.ones(shape=5))           # 1D array of 5 ones; a shape tuple gives nD here as well
# RES -> [1. 1. 1. 1. 1.]
print(np.eye(N=3))                # 3x3 matrix with ones on the diagonal
# RES -> [[1. 0. 0.]
#         [0. 1. 0.]
#         [0. 0. 1.]]

# C) Generate random data (no seed is set, so the numbers differ on every run)
print(np.random.rand(2, 3))       # 2x3 matrix, every value between 0 and 1
# RES -> [[0.08564555 0.5295177  0.82151728]
#         [0.83742842 0.96277555 0.84243966]]


print(np.random.randint(low=5, high=10, size=3))        # 3 integers from 5 up to 9 (high is excluded)
# RES -> [6 6 8]
print(np.random.randint(low=5, high=55, size=(2, 5)))   # 10 integers from 5 up to 54, laid out as 2x5
# RES -> [[35 46 49 16 14]
#         [53 43 16 51 23]]

# randn: samples from the "standard normal" distribution (mean = 0, SD = 1)
print(np.random.randn(1, 5))
# RES -> [[ 0.95835337 -1.70240379 -0.9072812  -0.51675364  1.25098128]]

# ---------- A few major methods ----------
x = np.arange(5, 11)
print(x)
# RES -> [ 5  6  7  8  9 10]
# Largest and smallest value, followed by the index at which each one sits.
print(x.max(), x.min(), x.argmax(), x.argmin())
# RES -> 10 5 5 0

# shape is an attribute (no parentheses), whereas max() is a method.
# Here: a 1D array holding 6 elements.
print("X shape : ", x.shape)
# RES -> X shape :  (6,)

# Re-arrange the same 6 elements into a 2D array: 2 rows of 3 elements each.
y = x.reshape((2, 3))
print(y)
# RES -> [[ 5  6  7]
#         [ 8  9 10]]

print("Y Shape : ", y.shape)
# RES -> Y Shape :  (2, 3)

# ---------- Indexing & slicing ----------
arr = np.arange(1, 11)
print(arr)
# RES -> [ 1  2  3  4  5  6  7  8  9 10]
print(arr[5])    # the element at index 5
# RES -> 6
print(arr[5:9])  # indices 5 to 8 - the stop index 9 is excluded
# RES -> [6 7 8 9]
print(arr[5:])   # index 5 up to the end
# RES -> [ 6  7  8  9 10]
print(arr[:5])   # from the start up to (not including) index 5
# RES -> [1 2 3 4 5]

# ---------- Broadcasting ----------
myArr = np.arange(11, 66, 11)
print(myArr)
# RES -> [11 22 33 44 55]
# One scalar assigned to a slice is written into each of the first 3 elements.
myArr[:3] = 99
print(myArr)
# RES -> [99 99 99 44 55]

# Plain assignment does not copy anything: myArr2 is just a second name for
# the very same array, so a change made through it shows up in myArr too.
myArr2 = myArr
myArr2[0] = 0
print(myArr)
# RES -> [ 0 99 99 44 55]

# copy() gives an independent array; changing the copy leaves myArr alone.
myArr3 = myArr.copy()
myArr3[0] = -1
print(myArr)
# RES -> [ 0 99 99 44 55]

# ---------- Broadcasting a comparison: boolean arrays as filters ----------

x = np.arange(1, 7)
print(x)
# RES -> [1 2 3 4 5 6]
# The comparison is applied to every element and yields one boolean each.
boolX = x > 3
print(boolX)
# RES -> [False False False  True  True  True]
# Indexing with the boolean array returns only the values greater than 3.
print(x[boolX])
# RES -> [4 5 6]

# ---------- 2D array: selection and indexing ----------
# A 2D array is built from a list of equally long 1D rows.
arr2d = np.array([
    [5, 10, 15],
    [20, 25, 30],
    [35, 40, 45],
])
print(arr2d)
# RES -> [[ 5 10 15]
#         [20 25 30]
#         [35 40 45]]
print(arr2d[0])         # the first row
# RES -> [ 5 10 15]
print(arr2d[0, 2])      # first row, last element
# RES -> 15
print(arr2d[:2])        # rows 0 and 1 - the third row is left out
# RES -> [[ 5 10 15]
#         [20 25 30]]
print(arr2d[:2, 1])     # from those two rows, only the column at index 1
# RES -> [10 25]
print(arr2d[:2, 1:])    # from those two rows, the columns from index 1 onwards
# RES -> [[10 15]
#         [25 30]]