Pandas DataFrame is a table with rows and columns
A group of Pandas Series-objects with common index
Operations:
- Create DF -> pd.DataFrame(data=ndArray/tuple/Dict => Iterable, index= array-like, columns=array-like)
- Grab one/many columns – myDF['colName'] (one column, a Series) or myDF[['colA', 'colB']] (several columns, a DataFrame)
- Grab one/many rows – myDF.iloc[0] or myDF.loc['IndexName']
- Insert a new column – myDF['newColumnName'] = myDF['someOldColumn'] / 100.0
- Insert a new row – pd.concat([myDF, newRowSet]) (the older myDF.append(newRowSet) was deprecated and then removed in pandas 2.0)
- Aabhar (credit): Jose Portilla (Head of Data Science at Pierian Training) @Udemy
import numpy as np
import pandas as pd
# ---------- Building a DataFrame from a NumPy array ----------
# Fixing the seed makes every run produce the same pseudo-random numbers.
np.random.seed(101)
myData = np.random.randint(low=0, high=101, size=(4, 3))
print(myData)
# RES ->
# [[95 11 81]
#  [70 63 87]
#  [75  9 77]
#  [40  4 63]]

# With no labels supplied, both the row index and the column index
# are simply numbered 0, 1, 2, ...
myDf = pd.DataFrame(myData)
print(myDf)
# RES ->
#     0   1   2
# 0  95  11  81
# 1  70  63  87
# 2  75   9  77
# 3  40   4  63

# Same data again, this time with explicit row and column labels.
fruitLabels = ["Apple", "Berry", "Cherry", "Dates"]
monthLabels = ["Jan", "Feb", "Mar"]
myDf = pd.DataFrame(myData, index=fruitLabels, columns=monthLabels)
print(myDf)
# RES ->
#         Jan  Feb  Mar
# Apple    95   11   81
# Berry    70   63   87
# Cherry   75    9   77
# Dates    40    4   63
# ---------- Loading a DataFrame from a CSV file ----------
# A raw string keeps every backslash of the Windows path literal; inside a
# normal string a bare "\m" is an invalid escape sequence and triggers a
# warning on recent Python versions.
myDf = pd.read_csv(filepath_or_buffer=r'D:\DataScienceLearning\PythonPrograms\RK_PGMS\myData.csv')

# Column labels of the loaded table.
print(myDf.columns)
# RES -> Index(['name', 'region', 'numberrange', 'currency', 'country'], dtype='object')

# Row labels: a default 0..N-1 range because the CSV supplies no index column.
print(myDf.index)
# RES -> RangeIndex(start=0, stop=40, step=1)

# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line after the report.
myDf.info()
# RES -> describes each column (5 of them) and the total number of rows (40)
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 40 entries, 0 to 39
# Data columns (total 5 columns):
#  #   Column       Non-Null Count  Dtype
# ---  ------       --------------  -----
#  0   name         40 non-null     object
#  1   region       40 non-null     object
#  2   numberrange  40 non-null     int64
#  3   currency     40 non-null     object
#  4   country      40 non-null     object
# dtypes: int64(1), object(4)
# memory usage: 1.7+ KB
print()
# Tip: inside a Jupyter notebook, evaluate the expression without print()
# to get a nicely rendered table instead of plain text.

# head() returns the first 5 rows unless a count is given.
print(myDf.head(n=2))
# RES ->
#             name          region  numberrange currency         country
# 0   Mufutau Moon      Quảng Bình            6   $99.10  United Kingdom
# 1  Harrison Bass  Stockholms län            9   $34.75          Mexico

# tail() returns the last 5 rows unless a count is given.
print(myDf.tail(n=3))
# RES ->
#                 name          region  numberrange currency      country
# 37          Otto Ray           Bihar            3    $9.07     Colombia
# 38  Blake Fitzgerald    South Island            4   $15.79       Mexico
# 39    Clarke Harrell  Jönköpings län            5   $45.20  New Zealand

# describe() computes summary statistics for the numeric columns.
numericSummary = myDf.describe()
print(numericSummary)
# RES ->
#        numberrange
# count    40.000000
# mean      4.950000
# std       2.630687
# min       0.000000
# 25%       3.000000
# 50%       5.000000
# 75%       7.000000
# max      10.000000

# Transposed, the same summary reads as one row per column.
print(numericSummary.T)
# RES ->
#              count  mean       std  min  25%  50%  75%   max
# numberrange   40.0  4.95  2.630687  0.0  3.0  5.0  7.0  10.0
# ---------- Selecting columns ----------
# A single column pulled out of a DataFrame is a pandas Series.
nameColumn = myDf['name']
numberColumn = myDf['numberrange']
print(type(nameColumn), "---", type(numberColumn))
# RES -> <class 'pandas.core.series.Series'> --- <class 'pandas.core.series.Series'>

# To select several columns, pass a list of names; written inline this
# gives the double square brackets, myDf[[...]].
wantedColumns = ['name', 'numberrange']
print(myDf[wantedColumns].head(3))
# RES ->
#             name  numberrange
# 0   Mufutau Moon            6
# 1  Harrison Bass            9
# 2       Leo Cruz           10
# ---------- Adding and removing columns / rows ----------
# Assigning to a new label creates the column.
myDf['XX'] = myDf['numberrange'] * 2
print(myDf.head(2))
# RES ->
#             name          region  numberrange currency         country  XX
# 0   Mufutau Moon      Quảng Bình            6   $99.10  United Kingdom  12
# 1  Harrison Bass  Stockholms län            9   $34.75          Mexico  18

# Drop a column. drop() returns a new DataFrame by default, leaving myDf
# untouched; inplace=True would modify myDf itself instead.
myDf2 = myDf.drop(columns='XX')
print(myDf2.head(2))
# RES ->
#             name          region  numberrange currency         country
# 0   Mufutau Moon      Quảng Bình            6   $99.10  United Kingdom
# 1  Harrison Bass  Stockholms län            9   $34.75          Mexico

# Drop rows by their index labels.
myDf3 = myDf.drop(index=[36, 37])
print(myDf3.tail(4))
# RES ->
#                 name          region  numberrange currency         country  XX
# 34       Illana Peck        Sardegna            5   $28.04          Norway  10
# 35       Ima Hawkins       Querétaro            9   $45.63  United Kingdom  18
# 38  Blake Fitzgerald    South Island            4   $15.79          Mexico   8
# 39    Clarke Harrell  Jönköpings län            5   $45.20     New Zealand  10
# ---------- Changing the index ----------
# Promote the 'XX' column to be the row index instead of the default 0, 1, 2, ...
myDf3.set_index('XX', inplace=True)
print(myDf3.head(2))
# RES -> note that XX is now the index
#              name          region  numberrange currency         country
# XX
# 12   Mufutau Moon      Quảng Bình            6   $99.10  United Kingdom
# 18  Harrison Bass  Stockholms län            9   $34.75          Mexico

# reset_index() returns a new DataFrame rather than modifying the existing
# one, so the result has to be assigned back; otherwise the call has no
# effect. Afterwards 'XX' is an ordinary column again and the index is
# back to the 0, 1, 2, ... pattern.
myDf3 = myDf3.reset_index()
print()
# ---------- Selecting rows with iloc / loc ----------
# iloc selects by integer position, loc by index label.
# Note: a slice such as iloc[5:8] yields a DataFrame; a single position
# such as iloc[5] would yield a Series.
rowsFiveToSeven = myDf.iloc[5:8]
print(rowsFiveToSeven)
# RES ->
#              name        region  numberrange currency  country  XX
# 5       Zelda Gay      Connacht            4    $7.75    Italy   8
# 6  Nichole Oliver  Penza Oblast            5   $62.07  Nigeria  10
# 7    Anika Haynes      Los Ríos            6   $44.45   France  12

# Build a copy indexed by 'name' so that rows can be looked up by name.
# (To inspect it: print(myDfX.head(10)))
myDfX = myDf.set_index('name')

# Select a particular row by its new index value; passing a list of labels
# keeps the result a DataFrame.
print(myDfX.loc[['Mufutau Moon']])
# RES ->
#                   region  numberrange currency         country  XX
# name
# Mufutau Moon  Quảng Bình            6   $99.10  United Kingdom  12

# Selected rows combined with selected columns: the first list names the
# rows, the second list names the columns.
chosenNames = ['Mufutau Moon', 'Cora Newton']
chosenColumns = ['country', 'currency']
print(myDfX.loc[chosenNames, chosenColumns])
# RES ->
#                      country currency
# name
# Mufutau Moon  United Kingdom   $99.10
# Cora Newton       Costa Rica   $66.40
# ---------- Appending a row ----------
# DataFrame.append() was deprecated and then removed in pandas 2.0;
# pd.concat() is the supported way to stack rows onto a DataFrame.
oneRowAsSeries = myDf.iloc[3]   # a single position returns the row as a Series
# A list of positions returns a one-row DataFrame instead, which keeps each
# column's dtype and can be concatenated directly.
oneRowDf = myDf.iloc[[3]]
# The added row keeps its original label (3), so that label appears twice
# in myNewDf; pass ignore_index=True to concat if a fresh 0..N-1 index is wanted.
myNewDf = pd.concat([myDf, oneRowDf])
print(myNewDf.count())
# RES -> one more row than the 40 loaded from the CSV
# name           41
# region         41
# numberrange    41
# currency       41
# country        41
# XX             41
# dtype: int64
# ---------- Conditional (boolean) selection ----------
# Build a True/False mask: True where numberrange is a multiple of 6
# (0 qualifies as well), then keep only the rows where the mask is True.
boolSeries = myNewDf['numberrange'].mod(6).eq(0)
print(myNewDf[boolSeries])
# RES ->
#             name         region  numberrange currency         country  XX
# 0   Mufutau Moon     Quảng Bình            6   $99.10  United Kingdom  12
# 7   Anika Haynes       Los Ríos            6   $44.45          France  12
# 14  Ethan Powers   South Island            6   $30.46          Brazil  12
# 22    Abbot Bird   South Island            6   $71.36         Ukraine  12
# 27    Ivana Bell  Valle d'Aosta            0   $76.95           Chile   0
# 29   Phoebe Goff       Arkansas            6   $40.60      Costa Rica  12

# Unfinished experiment with isin(), kept for reference:
#myOptions = ['Mufutau Moon', 'Ivana Bel']
#myDF.isin(myOptions)
#myNewDf[myNewDf.isin(myOptions)]