Python数据分析入门之基础知识、numpy、matplotlib和pandas

Basic knowledge

conditionals

if
elif
else

mark= 56
if mark>= 69.5:
    print("distribution")
elif mark>= 59.5:
    print("merit")
elif mark>= 50.0:
    print("pass")
else:
    print("Fail")

pass

Loops

for
while
break

numbers= [1,2,3,4,5,6]

for i in numbers:
    print(i)

number_plus_one= []

number_plus_one= []
for i in numbers:
    number_plus_one.append(i+1)
    print("current number is: "+ str(i))

current number is: 1
current number is: 2
current number is: 3
current number is: 4
current number is: 5
current number is: 6

number_plus_one

[2, 3, 4, 5, 6, 7]

for i in range(1, 8, 2):
    print(i)

count= 0while count< 5:    print(count)    count+= 1

count= 0while True:    print(count)    count+= 1    if count>= 5:        break

Functions

def my_func():    print("Hello from my func")

my_func()

Hello from my func

def my_func2(greeting, planet):    print(greeting+ ' '+ planet+ '!')

my_func2('Hello', 'World')

Hello World!

def add(a, b):    return a+ b

result= add(1, 2)print(result)

result

Python packages, and the `numpy` package

import numpy as np

np.identity(5)

array([[1., 0., 0., 0., 0.],       [0., 1., 0., 0., 0.],       [0., 0., 1., 0., 0.],       [0., 0., 0., 1., 0.],       [0., 0., 0., 0., 1.]])

from numpy import identityidentity(5)

array([[1., 0., 0., 0., 0.],       [0., 1., 0., 0., 0.],       [0., 0., 1., 0., 0.],       [0., 0., 0., 1., 0.],       [0., 0., 0., 0., 1.]])

from matplotlib import pyplot as plt

plt.plot([1,2,3],[1,4,1],'or')

[<matplotlib.lines.Line2D at 0x208b1c47780>]

png

Numpy arrays

import numpy as np

arr= np.array([1.2, 3.14, -6.45])

arr

array([ 1.2 ,  3.14, -6.45])

2* [23,42]

[23, 42, 23, 42]

2* arr

array([  2.4 ,   6.28, -12.9 ])

arr+ 1

array([ 2.2 ,  4.14, -5.45])

arr* -1

array([-1.2 , -3.14,  6.45])

arr** 2

array([ 1.44  ,  9.8596, 41.6025])

arr1= arrarr2= np.array([1, 2, 3])

print([1, 2 ,3]+ [3, 4, 5]) #listarr1+ arr2 #numpy array

[1, 2, 3, 3, 4, 5]





array([ 2.2 ,  5.14, -3.45])

arr1* arr2

array([  1.2 ,   6.28, -19.35])

np.sin(1)

0.8414709848078965

np.sin(arr1)

array([ 0.93203909,  0.00159265, -0.16604211])

arr1[0]

1.2

arr[: 2]

array([1.2 , 3.14])

Numpy arrays pt 2

import numpy as np

a= np.array([[1, 2, 3],[4, 5, 6]])

array([[1, 2, 3],       [4, 5, 6]])

a.shape #get the number of rows and cols

(2, 3)

nrows, ncols= a.shapeprint(nrows, ncols)

2 3

a.shape= 6

array([1, 2, 3, 4, 5, 6])

a.shape= [3, 2]

array([[1, 2],       [3, 4],       [5, 6]])

a.shape= [2, 3]

array([[1, 2, 3],       [4, 5, 6]])

np.zeros(5)

array([0., 0., 0., 0., 0.])

np.zeros([3, 4])

array([[0., 0., 0., 0.],       [0., 0., 0., 0.],       [0., 0., 0., 0.]])

np.ones([3, 4])

array([[1., 1., 1., 1.],       [1., 1., 1., 1.],       [1., 1., 1., 1.]])

np.arange(0.1, 0.2, 0.01) #start, stop, step

array([0.1 , 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19])

np.linspace(0.1, 0.2, 15) #rather than the step, the third one is the numbers of  nums between the start and the stop

array([0.1       , 0.10714286, 0.11428571, 0.12142857, 0.12857143,       0.13571429, 0.14285714, 0.15      , 0.15714286, 0.16428571,       0.17142857, 0.17857143, 0.18571429, 0.19285714, 0.2       ])

a= np.array([1, 2, 3])b= np.array([4, 5, 6])

np.hstack([a, b]) #horizentally

array([1, 2, 3, 4, 5, 6])

np.vstack([a, b]) #vertically

array([[1, 2, 3],       [4, 5, 6]])

More `numpy` features (other than arrays)

import numpy as np

np.random.normal(1, 2, 5)# (mean, sd, scale)

array([0.26692905, 3.25555715, 5.55266956, 1.23445221, 1.85074104])

a= np.random.normal(1, 2, 10)

array([ 1.61286433, -2.2266842 , -2.06336836,  1.70629078, -0.01086119,        0.9961519 ,  0.42980328,  4.31010475, -0.07291085, -2.21994622])

np.average(a)

0.2461444230839865

np.var(a)

3.8810147907170185

b= np.random.normal(0, 5, [5, 3])

array([[  4.14403725,   4.93392806, -12.57708917],       [  3.74246174,   1.43706417,  -3.8630664 ],       [  1.94530969,   6.6532378 ,  -3.39473863],       [  5.91008478,  -8.03735424,   7.02867261],       [  4.88898932,   1.91366109,   3.04734046]])

np.average(b)

1.1848359007671332

np.average(b, axis= 0) # axis= 0 :row

array([ 4.12617655,  1.38010737, -1.95177623])

np.average(b, axis= 1) # axis= 1: col

array([-1.16637462,  0.43881984,  1.73460295,  1.63380105,  3.28333029])

m= np.matrix([[1, 2], [4, 5]])

matrix([[1, 2],        [4, 5]])

np.linalg.eig(m) #Compute the eigenvalues and right eigenvectors of a square array

(array([-0.46410162,  6.46410162]), matrix([[-0.80689822, -0.34372377],         [ 0.59069049, -0.9390708 ]]))

A simple plot with matplotlib

import numpy as nptime = np.linspace(0, 2* 365, 2* 365)temperature = 20+ 5* np.sin(2* np.pi/ 365* time)temperature= temperature + np.random.normal(size= 2* 365)

from matplotlib import pyplot as pltplt.plot(time, temperature, 'sb')plt.xlabel('time [days]')plt.ylabel('temperature')plt.title('Simulated temperature time series')plt.show()

png

plt.plot?

Bar plots

import numpy as npfrom matplotlib import pyplot as plt

x= np.array([1,2,3])y= np.array([3, 10, 5])plt.bar(x, y, color= 'red', edgecolor= 'blue', linewidth= 3)plt.show()

png

Histograms

import numpy as npfrom matplotlib import pyplot as plt

x= np.random.normal(10, 2, size= 10000)counts, breaks= np.histogram(x, bins= np.arange(0, 20, 0.5))len(breaks) #40len(counts) #39plt.bar(breaks[:-1], counts) #需要去除breaks中的最后一个值

<BarContainer object of 39 artists>

png

plt.hist(x, bins=np.arange(0, 20, 0.1), color= 'orange')plt.show()

png

Boxplot

import numpy as npfrom matplotlib import pyplot as plt

x= np.random.normal(10, 2, 1000)

plt.boxplot(x)plt.show()

png

data= [x, x[:100], x[10:]]plt.boxplot(data, labels= ['all', 'fisrt 100', 'rest'])plt.show()

png

Raster plots(光栅图) and contour（等高线） plots

import numpy as npfrom matplotlib import pyplot as plt

xx= np.repeat(np.linspace(-2, 2, 50), 50).reshape(50, 50)yy= np.transpose(xx)zz= np.exp(-xx**2- yy**2)

plt.matshow(zz, cmap= plt.cm.rainbow)plt.colorbar()plt.contour(zz, levels =3, colors= 'black')plt.show()

png

zz.shape

(50, 50)

Geographical maps

from matplotlib import pyplot as pltfrom mpl_toolkits.basemap import Basemap

map= Basemap(projection= 'merc', resolution= 'l', llcrnrlat=45, urcrnrlat= 60, llcrnrlon= -10, urcrnrlon= 10)map.drawcoastlines()map.drawcountries()map.fillcontinents(color= 'beige')map.drawmapboundary(fill_color= 'lightblue')map.scatter(x= 355, y= 50, latlon= True)plt.show()

png

Resizing and saving figures

from matplotlib import pyplot as pltplt.plot([1,4,2,6])plt.figure(figsize=[6,6])# plt.savefig('figure1.png')plt.show()

png

<Figure size 432x432 with 0 Axes>

Pandas

import pandas as pd

pd? # see the help document

Pandas Series

import pandas as pdimport numpy as np

s= pd.Series([1, 3, 5, np.nan, 6, 8])s

0    1.01    3.02    5.03    NaN4    6.05    8.0dtype: float64

s.values

array([ 1.,  3.,  5., nan,  6.,  8.])

s.index

RangeIndex(start=0, stop=6, step=1)

s[4]

6.0

s[: 3]

0    1.01    3.02    5.0dtype: float64

s= pd.Series(data= [1, 2, 3], index= ['a', 'b', 'c'])s

a    1b    2c    3dtype: int64

s[2]

s['c']

s[0: 2]

a    1b    2dtype: int64

s['a':'c']

a    1b    2c    3dtype: int64

population= pd.Series(    index= ['russia', 'turkey', 'germany', 'france', 'uk'],    data= [146, 83, 82, 67, 66])

population

russia     146turkey      83germany     82france      67uk          66dtype: int64

population['uk']

Pandas DataFrames

import numpy as npimport pandas as pd

population= pd.Series(    index= ['russia', 'turkey', 'germany'],    data= [146, 83, 82])area= pd.Series(    index= ['russia', 'germany', 'turkey'],    data= [3995, 357, 783])

countries= pd.DataFrame({'population': population, 'area': area})

countries

	population	area
germany	82	357
russia	146	3995
turkey	83	783

Index Series

import pandas as pdimport numpy as np

s= pd.Series(data= [1, 2, 3], index= ['a', 'b', 'c'])

s.index

Index(['a', 'b', 'c'], dtype='object')

s.keys()

Index(['a', 'b', 'c'], dtype='object')

s[1]

s['b']

s.items()

<zip at 0x209a4e97288>

for index, val in s.items():    print('index '+ str(index)+ ' '+ ':: value '+ str(val))

index a :: value 1index b :: value 2index c :: value 3

s[0: 2]

a    1b    2dtype: int64

s['a': 'b']

a    1b    2dtype: int64

s[['a', 'c']]

a    1c    3dtype: int64

s[s>= 1]

a    1b    2c    3dtype: int64

s[s> 1]

b    2c    3dtype: int64

ss= pd.Series(index= [1, 3, 5], data= ['a', 'b', 'c'])ss

1    a3    b5    cdtype: object

ss[1] #explicit indexing

'a'

ss[0: 3] #implicit  indexing

1    a3    b5    cdtype: object

ss.loc[1: 3] #explicit indexing

1    a3    bdtype: object

ss.iloc[1]

'b'

Indexing DataFrames

import pandas as pdimport numpy as np

population= pd.Series(    index= ['russia', 'turkey', 'germany'],    data= [146, 83, 82])area= pd.Series(    index= ['russia', 'germany', 'turkey'],    data= [3995, 357, 783])countries= pd.DataFrame({'population': population, 'area': area})countries

	population	area
germany	82	357
russia	146	3995
turkey	83	783

countries['area']

germany     357russia     3995turkey      783Name: area, dtype: int64

countries['germany': 'turkey']

	population	area
germany	82	357
russia	146	3995
turkey	83	783

countries['population': 'area']

	population	area

countries.loc['turkey', :]

population     83area          783Name: turkey, dtype: int64

countries.loc['russia']

population     146area          3995Name: russia, dtype: int64

countries.loc['russia': 'turkey', 'population']

russia    146turkey     83Name: population, dtype: int64

countries.loc[:, 'area']

germany     357russia     3995turkey      783Name: area, dtype: int64

countries.iloc[0, 0]

countries.iloc[1] # extract the specific row

population     146area          3995Name: russia, dtype: int64

countries.iloc[:, 1] # all rows and area

germany     357russia     3995turkey      783Name: area, dtype: int64

Add/ Removing coloums

import pandas as pd

df= pd.DataFrame({'x':[1,2,3], 'y': ['a', 'b', 'c']})df

	x	y
0	1	a
1	2	b
2	3	c

df['z']= [True, False, True]df

	x	y	x_squared	z
0	1	a	1	True
1	2	b	4	False
2	3	c	9	True

df['x_squared']= df['x']** 2df

	x	y	z	x_squared
0	1	a	True	1
1	2	b	False	4
2	3	c	True	9

df.drop('z', axis= 1) # col: axis= 1, row: axis= 0

	x	y	x_squared
0	1	a	1
1	2	b	4
2	3	c	9

df # but the original data has not been manipulated

	x	y	z	x_squared
0	1	a	True	1
1	2	b	False	4
2	3	c	True	9

df= df.drop('z', axis= 1)

df

	x	y	x_squared
0	1	a	1
1	2	b	4
2	3	c	9

Filtering Rows

import numpy as npimport pandas as pd

df= pd.DataFrame({'x': np.random.normal(size= 10)})df

	x
0	0.862766
1	-1.622989
2	0.211018
3	1.000474
4	0.568656
5	1.021812
6	0.424689
7	0.273512
8	-0.913785
9	0.599780

df.drop(0, axis= 0) # remove the first row, not permanently

	x
1	-1.622989
2	0.211018
3	1.000474
4	0.568656
5	1.021812
6	0.424689
7	0.273512
8	-0.913785
9	0.599780

df.drop([1, 2, 3], inplace= True) # remove these rows permanently: set inpalce= True

df

	x
0	0.862766
4	0.568656
5	1.021812
6	0.424689
7	0.273512
8	-0.913785
9	0.599780

df[(df['x']> 0) & (df['x']< 1)] # positive values

	x
0	0.862766
4	0.568656
6	0.424689
7	0.273512
9	0.599780

df.query('x> 0 & x< 1')

	x
0	0.862766
4	0.568656
6	0.424689
7	0.273512
9	0.599780

Combining , merging, joining, DataFrames

import pandas as pd

df1= pd.DataFrame({'x': ['a', 'b']}, index= [1, 2])df2= pd.DataFrame({'x': ['c', 'd']}, index= [3, 4])pd.concat([df1, df2])

	x
1	a
2	b
3	c
4	d

df3= pd.DataFrame({'x': ['a', 'b']}, index= [1, 2])df4= pd.DataFrame({'y': ['c', 'd']}, index= [1, 2])pd.concat([df3, df4], axis= 1) # axis= 0,merge them among the rows; axis= 1,merge them among the cols

	x	y
1	a	c
2	b	d

df5= pd.DataFrame({'x': ['a', 'b']}, index= [1, 2])df6= pd.DataFrame({'y': ['c', 'd']}, index= [3, 4])pd.concat([df3, df4])

D:\Users\Howell.L\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: FutureWarning: Sorting because non-concatenation axis is not aligned. A future versionof pandas will change to not sort by default.To accept the future behavior, pass 'sort=False'.To retain the current behavior and silence the warning, pass 'sort=True'.  This is separate from the ipykernel package so we can avoid doing imports until

	x	y
1	a	NaN
2	b	NaN
1	NaN	c
2	NaN	d

Aggregates and group aggregates

import pandas as pdimport numpy as np

df= pd.DataFrame({'product': ['car', 'car', 'fruit', 'fruit'], 'price': [5000, 20000, 0.5, 1.2]})df

	product	price
0	car	5000.0
1	car	20000.0
2	fruit	0.5
3	fruit	1.2

df['product'].value_counts()

fruit    2car      2Name: product, dtype: int64

df['price'].agg([np.mean, np.min, np.max])

mean     6250.425amin        0.500amax    20000.000Name: price, dtype: float64

df.groupby('product')['price'].agg([np.mean, np.min, np.max])

	mean	amin	amax
product
car	12500.00	5000.0	20000.0
fruit	0.85	0.5	1.2

Dates and times

import pandas as pd

dates= pd.to_datetime(['2021-01-01 16:20', '1st of Jan, 2021', 'today'])

dates

DatetimeIndex(['2021-01-01 16:20:00', '2021-01-01 00:00:00',               '2021-05-16 21:10:11.326783'],              dtype='datetime64[ns]', freq=None)

dates.strftime('%Y-%m-%d')

Index(['2021-01-01', '2021-01-01', '2021-05-16'], dtype='object')

dates.strftime('%Y-%m-%d %H:%M')

Index(['2021-01-01 16:20', '2021-01-01 00:00', '2021-05-16 21:10'], dtype='object')

ts= pd.Series(data= [2, 1, 4], index= pd.to_datetime(['2020-01-01', '2021/01/01', '1st Jan 2022']))ts

2020-01-01    22021-01-01    12022-01-01    4dtype: int64

from matplotlib import pyplot as pltplt.figure(figsize= [10, 4])plt.plot(ts)plt.show()

C:\Users\Howell.L\AppData\Roaming\Python\Python37\site-packages\pandas\plotting\_matplotlib\converter.py:103: FutureWarning: Using an implicitly registered datetime converter for a matplotlib plotting method. The converter was registered by pandas on import. Future versions of pandas will require you to explicitly register matplotlib converters.To register the converters:	>>> from pandas.plotting import register_matplotlib_converters	>>> register_matplotlib_converters()  warnings.warn(msg, FutureWarning)

png

posted @ 2021-05-16 21:33 MRWH7 阅读(128) 评论(0) 收藏举报

刷新页面返回顶部

MRWH

Python数据分析入门之基础知识、numpy、matplotlib和pandas

Basic knowledge

conditionals

Loops

Functions

Python packages, and the numpy package

Numpy arrays

Numpy arrays pt 2

More numpy features (other than arrays)

A simple plot with matplotlib

Bar plots

Histograms

Boxplot

Raster plots(光栅图) and contour（等高线） plots

Geographical maps

Resizing and saving figures

Pandas

Pandas Series

Pandas DataFrames

Index Series

Indexing DataFrames

Add/ Removing coloums

Filtering Rows

Combining , merging, joining, DataFrames

Aggregates and group aggregates

Dates and times

公告

Python packages, and the `numpy` package

More `numpy` features (other than arrays)