Basic knowledge
conditionals
mark= 56
if mark>= 69.5:
print("distribution")
elif mark>= 59.5:
print("merit")
elif mark>= 50.0:
print("pass")
else:
print("Fail")
pass
Loops
numbers= [1,2,3,4,5,6]
for i in numbers:
print(i)
1
2
3
4
5
6
number_plus_one= []
number_plus_one= []
for i in numbers:
number_plus_one.append(i+1)
print("current number is: "+ str(i))
current number is: 1
current number is: 2
current number is: 3
current number is: 4
current number is: 5
current number is: 6
number_plus_one
[2, 3, 4, 5, 6, 7]
for i in range(1, 8, 2):
print(i)
1357
count= 0while count< 5: print(count) count+= 1
01234
count= 0while True: print(count) count+= 1 if count>= 5: break
01234
Functions
def my_func(): print("Hello from my func")
my_func()
Hello from my func
def my_func2(greeting, planet): print(greeting+ ' '+ planet+ '!')
my_func2('Hello', 'World')
Hello World!
def add(a, b): return a+ b
result= add(1, 2)print(result)
3
result
3
Python packages, and the numpy
package
import numpy as np
np.identity(5)
array([[1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.], [0., 0., 1., 0., 0.], [0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]])
from numpy import identityidentity(5)
array([[1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.], [0., 0., 1., 0., 0.], [0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]])
from matplotlib import pyplot as plt
plt.plot([1,2,3],[1,4,1],'or')
[<matplotlib.lines.Line2D at 0x208b1c47780>]

Numpy arrays
import numpy as np
arr= np.array([1.2, 3.14, -6.45])
arr
array([ 1.2 , 3.14, -6.45])
2* [23,42]
[23, 42, 23, 42]
2* arr
array([ 2.4 , 6.28, -12.9 ])
arr+ 1
array([ 2.2 , 4.14, -5.45])
arr* -1
array([-1.2 , -3.14, 6.45])
arr** 2
array([ 1.44 , 9.8596, 41.6025])
arr1= arrarr2= np.array([1, 2, 3])
print([1, 2 ,3]+ [3, 4, 5]) #listarr1+ arr2 #numpy array
[1, 2, 3, 3, 4, 5]
array([ 2.2 , 5.14, -3.45])
arr1* arr2
array([ 1.2 , 6.28, -19.35])
np.sin(1)
0.8414709848078965
np.sin(arr1)
array([ 0.93203909, 0.00159265, -0.16604211])
arr1[0]
1.2
arr[: 2]
array([1.2 , 3.14])
Numpy arrays pt 2
import numpy as np
a= np.array([[1, 2, 3],[4, 5, 6]])
a
array([[1, 2, 3], [4, 5, 6]])
a.shape #get the number of rows and cols
(2, 3)
nrows, ncols= a.shapeprint(nrows, ncols)
2 3
a.shape= 6
a
array([1, 2, 3, 4, 5, 6])
a.shape= [3, 2]
a
array([[1, 2], [3, 4], [5, 6]])
a.shape= [2, 3]
a
array([[1, 2, 3], [4, 5, 6]])
np.zeros(5)
array([0., 0., 0., 0., 0.])
np.zeros([3, 4])
array([[0., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.]])
np.ones([3, 4])
array([[1., 1., 1., 1.], [1., 1., 1., 1.], [1., 1., 1., 1.]])
np.arange(0.1, 0.2, 0.01) #start, stop, step
array([0.1 , 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19])
np.linspace(0.1, 0.2, 15) #rather than the step, the third one is the numbers of nums between the start and the stop
array([0.1 , 0.10714286, 0.11428571, 0.12142857, 0.12857143, 0.13571429, 0.14285714, 0.15 , 0.15714286, 0.16428571, 0.17142857, 0.17857143, 0.18571429, 0.19285714, 0.2 ])
a= np.array([1, 2, 3])b= np.array([4, 5, 6])
np.hstack([a, b]) #horizentally
array([1, 2, 3, 4, 5, 6])
np.vstack([a, b]) #vertically
array([[1, 2, 3], [4, 5, 6]])
More numpy
features (other than arrays)
import numpy as np
np.random.normal(1, 2, 5)# (mean, sd, scale)
array([0.26692905, 3.25555715, 5.55266956, 1.23445221, 1.85074104])
a= np.random.normal(1, 2, 10)
a
array([ 1.61286433, -2.2266842 , -2.06336836, 1.70629078, -0.01086119, 0.9961519 , 0.42980328, 4.31010475, -0.07291085, -2.21994622])
np.average(a)
0.2461444230839865
np.var(a)
3.8810147907170185
b= np.random.normal(0, 5, [5, 3])
b
array([[ 4.14403725, 4.93392806, -12.57708917], [ 3.74246174, 1.43706417, -3.8630664 ], [ 1.94530969, 6.6532378 , -3.39473863], [ 5.91008478, -8.03735424, 7.02867261], [ 4.88898932, 1.91366109, 3.04734046]])
np.average(b)
1.1848359007671332
np.average(b, axis= 0) # axis= 0 :row
array([ 4.12617655, 1.38010737, -1.95177623])
np.average(b, axis= 1) # axis= 1: col
array([-1.16637462, 0.43881984, 1.73460295, 1.63380105, 3.28333029])
m= np.matrix([[1, 2], [4, 5]])
m
matrix([[1, 2], [4, 5]])
np.linalg.eig(m) #Compute the eigenvalues and right eigenvectors of a square array
(array([-0.46410162, 6.46410162]), matrix([[-0.80689822, -0.34372377], [ 0.59069049, -0.9390708 ]]))
A simple plot with matplotlib
import numpy as nptime = np.linspace(0, 2* 365, 2* 365)temperature = 20+ 5* np.sin(2* np.pi/ 365* time)temperature= temperature + np.random.normal(size= 2* 365)
from matplotlib import pyplot as pltplt.plot(time, temperature, 'sb')plt.xlabel('time [days]')plt.ylabel('temperature')plt.title('Simulated temperature time series')plt.show()

plt.plot?
Bar plots
import numpy as npfrom matplotlib import pyplot as plt
x= np.array([1,2,3])y= np.array([3, 10, 5])plt.bar(x, y, color= 'red', edgecolor= 'blue', linewidth= 3)plt.show()

Histograms
import numpy as npfrom matplotlib import pyplot as plt
x= np.random.normal(10, 2, size= 10000)counts, breaks= np.histogram(x, bins= np.arange(0, 20, 0.5))len(breaks) #40len(counts) #39plt.bar(breaks[:-1], counts) #需要去除breaks中的最后一个值
<BarContainer object of 39 artists>

plt.hist(x, bins=np.arange(0, 20, 0.1), color= 'orange')plt.show()

Boxplot
import numpy as npfrom matplotlib import pyplot as plt
x= np.random.normal(10, 2, 1000)
plt.boxplot(x)plt.show()

data= [x, x[:100], x[10:]]plt.boxplot(data, labels= ['all', 'fisrt 100', 'rest'])plt.show()

Raster plots(光栅图) and contour(等高线) plots
import numpy as npfrom matplotlib import pyplot as plt
xx= np.repeat(np.linspace(-2, 2, 50), 50).reshape(50, 50)yy= np.transpose(xx)zz= np.exp(-xx**2- yy**2)
plt.matshow(zz, cmap= plt.cm.rainbow)plt.colorbar()plt.contour(zz, levels =3, colors= 'black')plt.show()

zz.shape
(50, 50)
Geographical maps
from matplotlib import pyplot as pltfrom mpl_toolkits.basemap import Basemap
map= Basemap(projection= 'merc', resolution= 'l', llcrnrlat=45, urcrnrlat= 60, llcrnrlon= -10, urcrnrlon= 10)map.drawcoastlines()map.drawcountries()map.fillcontinents(color= 'beige')map.drawmapboundary(fill_color= 'lightblue')map.scatter(x= 355, y= 50, latlon= True)plt.show()

from matplotlib import pyplot as pltplt.plot([1,4,2,6])plt.figure(figsize=[6,6])# plt.savefig('figure1.png')plt.show()

<Figure size 432x432 with 0 Axes>
Pandas
import pandas as pd
pd? # see the help document
Pandas Series
import pandas as pdimport numpy as np
s= pd.Series([1, 3, 5, np.nan, 6, 8])s
0 1.01 3.02 5.03 NaN4 6.05 8.0dtype: float64
s.values
array([ 1., 3., 5., nan, 6., 8.])
s.index
RangeIndex(start=0, stop=6, step=1)
s[4]
6.0
s[: 3]
0 1.01 3.02 5.0dtype: float64
s= pd.Series(data= [1, 2, 3], index= ['a', 'b', 'c'])s
a 1b 2c 3dtype: int64
s[2]
3
s['c']
3
s[0: 2]
a 1b 2dtype: int64
s['a':'c']
a 1b 2c 3dtype: int64
population= pd.Series( index= ['russia', 'turkey', 'germany', 'france', 'uk'], data= [146, 83, 82, 67, 66])
population
russia 146turkey 83germany 82france 67uk 66dtype: int64
population['uk']
66
Pandas DataFrames
import numpy as npimport pandas as pd
population= pd.Series( index= ['russia', 'turkey', 'germany'], data= [146, 83, 82])area= pd.Series( index= ['russia', 'germany', 'turkey'], data= [3995, 357, 783])
countries= pd.DataFrame({'population': population, 'area': area})
countries
|
population |
area |
germany |
82 |
357 |
russia |
146 |
3995 |
turkey |
83 |
783 |
Index Series
import pandas as pdimport numpy as np
s= pd.Series(data= [1, 2, 3], index= ['a', 'b', 'c'])
s.index
Index(['a', 'b', 'c'], dtype='object')
s.keys()
Index(['a', 'b', 'c'], dtype='object')
s[1]
2
s['b']
2
s.items()
<zip at 0x209a4e97288>
for index, val in s.items(): print('index '+ str(index)+ ' '+ ':: value '+ str(val))
index a :: value 1index b :: value 2index c :: value 3
s[0: 2]
a 1b 2dtype: int64
s['a': 'b']
a 1b 2dtype: int64
s[['a', 'c']]
a 1c 3dtype: int64
s[s>= 1]
a 1b 2c 3dtype: int64
s[s> 1]
b 2c 3dtype: int64
ss= pd.Series(index= [1, 3, 5], data= ['a', 'b', 'c'])ss
1 a3 b5 cdtype: object
ss[1] #explicit indexing
'a'
ss[0: 3] #implicit indexing
1 a3 b5 cdtype: object
ss.loc[1: 3] #explicit indexing
1 a3 bdtype: object
ss.iloc[1]
'b'
Indexing DataFrames
import pandas as pdimport numpy as np
population= pd.Series( index= ['russia', 'turkey', 'germany'], data= [146, 83, 82])area= pd.Series( index= ['russia', 'germany', 'turkey'], data= [3995, 357, 783])countries= pd.DataFrame({'population': population, 'area': area})countries
|
population |
area |
germany |
82 |
357 |
russia |
146 |
3995 |
turkey |
83 |
783 |
countries['area']
germany 357russia 3995turkey 783Name: area, dtype: int64
countries['germany': 'turkey']
|
population |
area |
germany |
82 |
357 |
russia |
146 |
3995 |
turkey |
83 |
783 |
countries['population': 'area']
countries.loc['turkey', :]
population 83area 783Name: turkey, dtype: int64
countries.loc['russia']
population 146area 3995Name: russia, dtype: int64
countries.loc['russia': 'turkey', 'population']
russia 146turkey 83Name: population, dtype: int64
countries.loc[:, 'area']
germany 357russia 3995turkey 783Name: area, dtype: int64
countries.iloc[0, 0]
82
countries.iloc[1] # extract the specific row
population 146area 3995Name: russia, dtype: int64
countries.iloc[:, 1] # all rows and area
germany 357russia 3995turkey 783Name: area, dtype: int64
Add/ Removing coloums
import pandas as pd
df= pd.DataFrame({'x':[1,2,3], 'y': ['a', 'b', 'c']})df
df['z']= [True, False, True]df
|
x |
y |
x_squared |
z |
0 |
1 |
a |
1 |
True |
1 |
2 |
b |
4 |
False |
2 |
3 |
c |
9 |
True |
df['x_squared']= df['x']** 2df
|
x |
y |
z |
x_squared |
0 |
1 |
a |
True |
1 |
1 |
2 |
b |
False |
4 |
2 |
3 |
c |
True |
9 |
df.drop('z', axis= 1) # col: axis= 1, row: axis= 0
|
x |
y |
x_squared |
0 |
1 |
a |
1 |
1 |
2 |
b |
4 |
2 |
3 |
c |
9 |
df # but the original data has not been manipulated
|
x |
y |
z |
x_squared |
0 |
1 |
a |
True |
1 |
1 |
2 |
b |
False |
4 |
2 |
3 |
c |
True |
9 |
df= df.drop('z', axis= 1)
df
|
x |
y |
x_squared |
0 |
1 |
a |
1 |
1 |
2 |
b |
4 |
2 |
3 |
c |
9 |
Filtering Rows
import numpy as npimport pandas as pd
df= pd.DataFrame({'x': np.random.normal(size= 10)})df
|
x |
0 |
0.862766 |
1 |
-1.622989 |
2 |
0.211018 |
3 |
1.000474 |
4 |
0.568656 |
5 |
1.021812 |
6 |
0.424689 |
7 |
0.273512 |
8 |
-0.913785 |
9 |
0.599780 |
df.drop(0, axis= 0) # remove the first row, not permanently
|
x |
1 |
-1.622989 |
2 |
0.211018 |
3 |
1.000474 |
4 |
0.568656 |
5 |
1.021812 |
6 |
0.424689 |
7 |
0.273512 |
8 |
-0.913785 |
9 |
0.599780 |
df.drop([1, 2, 3], inplace= True) # remove these rows permanently: set inpalce= True
df
|
x |
0 |
0.862766 |
4 |
0.568656 |
5 |
1.021812 |
6 |
0.424689 |
7 |
0.273512 |
8 |
-0.913785 |
9 |
0.599780 |
df[(df['x']> 0) & (df['x']< 1)] # positive values
|
x |
0 |
0.862766 |
4 |
0.568656 |
6 |
0.424689 |
7 |
0.273512 |
9 |
0.599780 |
df.query('x> 0 & x< 1')
|
x |
0 |
0.862766 |
4 |
0.568656 |
6 |
0.424689 |
7 |
0.273512 |
9 |
0.599780 |
Combining , merging, joining, DataFrames
import pandas as pd
df1= pd.DataFrame({'x': ['a', 'b']}, index= [1, 2])df2= pd.DataFrame({'x': ['c', 'd']}, index= [3, 4])pd.concat([df1, df2])
df3= pd.DataFrame({'x': ['a', 'b']}, index= [1, 2])df4= pd.DataFrame({'y': ['c', 'd']}, index= [1, 2])pd.concat([df3, df4], axis= 1) # axis= 0,merge them among the rows; axis= 1,merge them among the cols
df5= pd.DataFrame({'x': ['a', 'b']}, index= [1, 2])df6= pd.DataFrame({'y': ['c', 'd']}, index= [3, 4])pd.concat([df3, df4])
D:\Users\Howell.L\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: FutureWarning: Sorting because non-concatenation axis is not aligned. A future versionof pandas will change to not sort by default.To accept the future behavior, pass 'sort=False'.To retain the current behavior and silence the warning, pass 'sort=True'. This is separate from the ipykernel package so we can avoid doing imports until
|
x |
y |
1 |
a |
NaN |
2 |
b |
NaN |
1 |
NaN |
c |
2 |
NaN |
d |
Aggregates and group aggregates
import pandas as pdimport numpy as np
df= pd.DataFrame({'product': ['car', 'car', 'fruit', 'fruit'], 'price': [5000, 20000, 0.5, 1.2]})df
|
product |
price |
0 |
car |
5000.0 |
1 |
car |
20000.0 |
2 |
fruit |
0.5 |
3 |
fruit |
1.2 |
df['product'].value_counts()
fruit 2car 2Name: product, dtype: int64
df['price'].agg([np.mean, np.min, np.max])
mean 6250.425amin 0.500amax 20000.000Name: price, dtype: float64
df.groupby('product')['price'].agg([np.mean, np.min, np.max])
|
mean |
amin |
amax |
product |
|
|
|
car |
12500.00 |
5000.0 |
20000.0 |
fruit |
0.85 |
0.5 |
1.2 |
Dates and times
import pandas as pd
dates= pd.to_datetime(['2021-01-01 16:20', '1st of Jan, 2021', 'today'])
dates
DatetimeIndex(['2021-01-01 16:20:00', '2021-01-01 00:00:00', '2021-05-16 21:10:11.326783'], dtype='datetime64[ns]', freq=None)
dates.strftime('%Y-%m-%d')
Index(['2021-01-01', '2021-01-01', '2021-05-16'], dtype='object')
dates.strftime('%Y-%m-%d %H:%M')
Index(['2021-01-01 16:20', '2021-01-01 00:00', '2021-05-16 21:10'], dtype='object')
ts= pd.Series(data= [2, 1, 4], index= pd.to_datetime(['2020-01-01', '2021/01/01', '1st Jan 2022']))ts
2020-01-01 22021-01-01 12022-01-01 4dtype: int64
from matplotlib import pyplot as pltplt.figure(figsize= [10, 4])plt.plot(ts)plt.show()
C:\Users\Howell.L\AppData\Roaming\Python\Python37\site-packages\pandas\plotting\_matplotlib\converter.py:103: FutureWarning: Using an implicitly registered datetime converter for a matplotlib plotting method. The converter was registered by pandas on import. Future versions of pandas will require you to explicitly register matplotlib converters.To register the converters: >>> from pandas.plotting import register_matplotlib_converters >>> register_matplotlib_converters() warnings.warn(msg, FutureWarning)
