数据分析处理库pandas

老唐数据分析机器学习
# pandas_1

import pandas
food_info = pandas.read_csv("food_info.csv")
#print(type(food_info))
print (food_info.dtypes)

'''
NDB_No               int64
Shrt_Desc           object
Water_(g)          float64
Energ_Kcal           int64
Protein_(g)        float64
Lipid_Tot_(g)      float64
Ash_(g)            float64
Carbohydrt_(g)     float64
Fiber_TD_(g)       float64
Sugar_Tot_(g)      float64
Calcium_(mg)       float64
Iron_(mg)          float64
Magnesium_(mg)     float64
Phosphorus_(mg)    float64
Potassium_(mg)     float64
Sodium_(mg)        float64
Zinc_(mg)          float64
Copper_(mg)        float64
Manganese_(mg)     float64
Selenium_(mcg)     float64
Vit_C_(mg)         float64
Thiamin_(mg)       float64
Riboflavin_(mg)    float64
Niacin_(mg)        float64
Vit_B6_(mg)        float64
Vit_B12_(mcg)      float64
Vit_A_IU           float64
Vit_A_RAE          float64
Vit_E_(mg)         float64
Vit_D_mcg          float64
Vit_D_IU           float64
Vit_K_(mcg)        float64
FA_Sat_(g)         float64
FA_Mono_(g)        float64
FA_Poly_(g)        float64
Cholestrl_(mg)     float64
dtype: object
'''

#first_rows = food_info.head()
#print first_rows
#print(food_info.head(3))
#print food_info.columns
#print food_info.shape

#pandas uses zero-indexing
#Series object representing the row at index 0.
#print food_info.loc[0]

# Series object representing the seventh row.
#food_info.loc[6]

# Will throw an error: "KeyError: 'the label [8620] is not in the [index]'"
#food_info.loc[8620]
#The object dtype is equivalent to a string in Python

#object - For string values
#int - For integer values
#float - For float values
#datetime - For time values
#bool - For Boolean values
#print(food_info.dtypes)

# Returns a DataFrame containing the rows at indexes 3, 4, 5, and 6.
#food_info.loc[3:6]

# Returns a DataFrame containing the rows at indexes 2, 5, and 10. Either of the following approaches will work.
# Method 1
#two_five_ten = [2,5,10] 
#food_info.loc[two_five_ten]

# Method 2
#food_info.loc[[2,5,10]]

# Series object representing the "NDB_No" column.
#ndb_col = food_info["NDB_No"]
#print ndb_col
# Alternatively, you can access a column by passing in a string variable.
#col_name = "NDB_No"
#ndb_col = food_info[col_name]

#columns = ["Zinc_(mg)", "Copper_(mg)"]
#zinc_copper = food_info[columns]
#print zinc_copper
#print zinc_copper
# Skipping the assignment.
#zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]

#print(food_info.columns)
#print(food_info.head(2))
col_names = food_info.columns.tolist()
#print col_names
gram_columns = []

for c in col_names:
    if c.endswith("(g)"):
        gram_columns.append(c)
gram_df = food_info[gram_columns]
print(gram_df.head(3))

'''
Water_(g)  Protein_(g)  Lipid_Tot_(g)  Ash_(g)  Carbohydrt_(g)  \
0      15.87         0.85          81.11     2.11            0.06   
1      15.87         0.85          81.11     2.11            0.06   
2       0.24         0.28          99.48     0.00            0.00   

   Fiber_TD_(g)  Sugar_Tot_(g)  FA_Sat_(g)  FA_Mono_(g)  FA_Poly_(g)  
0           0.0           0.06      51.368       21.021        3.043  
1           0.0           0.06      50.489       23.426        3.012  
2           0.0           0.00      61.924       28.732        3.694  
'''

# pandas_2

import pandas
food_info = pandas.read_csv("food_info.csv")
col_names = food_info.columns.tolist()
print(col_names)
print(food_info.head(3))
'''
['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)']
   NDB_No                 Shrt_Desc  Water_(g)  Energ_Kcal  Protein_(g)  \
0    1001          BUTTER WITH SALT      15.87         717         0.85   
1    1002  BUTTER WHIPPED WITH SALT      15.87         717         0.85   
2    1003      BUTTER OIL ANHYDROUS       0.24         876         0.28   

   Lipid_Tot_(g)  Ash_(g)  Carbohydrt_(g)  Fiber_TD_(g)  Sugar_Tot_(g)  ...  \
0          81.11     2.11            0.06           0.0           0.06  ...   
1          81.11     2.11            0.06           0.0           0.06  ...   
2          99.48     0.00            0.00           0.0           0.00  ...   

   Vit_A_IU  Vit_A_RAE  Vit_E_(mg)  Vit_D_mcg  Vit_D_IU  Vit_K_(mcg)  \
0    2499.0      684.0        2.32        1.5      60.0          7.0   
1    2499.0      684.0        2.32        1.5      60.0          7.0   
2    3069.0      840.0        2.80        1.8      73.0          8.6   

   FA_Sat_(g)  FA_Mono_(g)  FA_Poly_(g)  Cholestrl_(mg)  
0      51.368       21.021        3.043           215.0  
1      50.489       23.426        3.012           219.0  
2      61.924       28.732        3.694           256.0  

[3 rows x 36 columns]
'''

#print food_info["Iron_(mg)"]
#div_1000 = food_info["Iron_(mg)"] / 1000
#print div_1000
# Adds 100 to each value in the column and returns a Series object.
#add_100 = food_info["Iron_(mg)"] + 100

# Subtracts 100 from each value in the column and returns a Series object.
#sub_100 = food_info["Iron_(mg)"] - 100

# Multiplies each value in the column by 2 and returns a Series object.
#mult_2 = food_info["Iron_(mg)"]*2


#It applies the arithmetic operator to the first value in both columns, the second value in both columns, and so on
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
iron_grams = food_info["Iron_(mg)"] / 1000  
food_info["Iron_(g)"] = iron_grams

#Score=2×(Protein_(g))−0.75×(Lipid_Tot_(g))
weighted_protein = food_info["Protein_(g)"] * 2
weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
initial_rating = weighted_protein + weighted_fat

# the "Vit_A_IU" column ranges from 0 to 100000, while the "Fiber_TD_(g)" column ranges from 0 to 79
#For certain calculations, columns like "Vit_A_IU" can have a greater effect on the result, 
#due to the scale of the values
# The largest value in the "Energ_Kcal" column.
max_calories = food_info["Energ_Kcal"].max()
# Divide the values in "Energ_Kcal" by the largest value.
normalized_calories = food_info["Energ_Kcal"] / max_calories
normalized_protein = food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
normalized_fat = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
food_info["Normalized_Protein"] = normalized_protein
food_info["Normalized_Fat"] = normalized_fat

#By default, pandas will sort the data by the column we specify in ascending order and return a new DataFrame
# Sorts the DataFrame in-place, rather than returning a new DataFrame.
#print food_info["Sodium_(mg)"]
food_info.sort_values("Sodium_(mg)", inplace=True)
print (food_info["Sodium_(mg)"])
#Sorts by descending order, rather than ascending.
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
print (food_info["Sodium_(mg)"])
'''
760     0.0
758     0.0
405     0.0
761     0.0
2269    0.0
       ... 
8184    NaN
8185    NaN
8195    NaN
8251    NaN
8267    NaN
Name: Sodium_(mg), Length: 8618, dtype: float64
276     38758.0
5814    27360.0
6192    26050.0
1242    26000.0
1245    24000.0
         ...   
8184        NaN
8185        NaN
8195        NaN
8251        NaN
8267        NaN
Name: Sodium_(mg), Length: 8618, dtype: float64
'''

# pandas_3

import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()

#The Pandas library uses NaN, which stands for "not a number", to indicate a missing value.
#we can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values
age = titanic_survival["Age"]
# print(age.loc[0:10])
age_is_null = pd.isnull(age)
# print (age_is_null)
age_null_true = age[age_is_null]
print (age_null_true)
age_null_count = len(age_null_true)
print(age_null_count) 
'''
5     NaN
17    NaN
19    NaN
26    NaN
28    NaN
       ..
859   NaN
863   NaN
868   NaN
878   NaN
888   NaN
Name: Age, Length: 177, dtype: float64
177
'''

#The result of this is that mean_age would be nan. This is because any calculations we do with a null value also result in a null value
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print (mean_age)
'''
nan
'''

#we have to filter out the missing values before we calculate the mean.
good_ages = titanic_survival["Age"][age_is_null == False]
#print good_ages
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)
'''
29.69911764705882
'''

# missing data is so common that many pandas methods automatically filter for it
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)
'''
29.69911764705882
'''

#mean fare for each class
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
print (fares_by_class)
'''
{1: 84.1546875, 2: 20.662183152173913, 3: 13.675550101832993}
'''

#index tells the method which column to group by
#values is the column that we want to apply the calculation to
#aggfunc specifies the calculation we want to perform
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean)
print (passenger_survival)
'''
          Survived
Pclass          
1       0.629630
2       0.472826
3       0.242363
'''

passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)
'''
              Age
Pclass           
1       38.233441
2       29.877630
3       25.140620
'''    

port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum)
print(port_stats)
'''
                Fare  Survived
Embarked                      
C         10072.2962        93
Q          1022.2543        30
S         17439.3988       217
'''

#specifying axis=1 or axis='columns' will drop any columns that have null values
drop_na_columns = titanic_survival.dropna(axis=1)
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])

print (drop_na_columns.shape)
'''
(891, 9)
'''

row_index_83_age = titanic_survival.loc[83,"Age"]
row_index_1000_pclass = titanic_survival.loc[766,"Pclass"]
print(row_index_83_age)
print(row_index_1000_pclass)
'''
28.0
1
'''

new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
print (new_titanic_survival[0:10])
titanic_reindexed = new_titanic_survival.reset_index(drop=True) # (drop=True) 表示原来的索引不要了，生成新的索引
print(titanic_reindexed.iloc[0:10])
'''
     PassengerId  Survived  Pclass                                  Name  \
630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   
851          852         0       3                   Svensson, Mr. Johan   
493          494         0       1               Artagaveytia, Mr. Ramon   
96            97         0       1             Goldschmidt, Mr. George B   
116          117         0       3                  Connors, Mr. Patrick   
672          673         0       2           Mitchell, Mr. Henry Michael   
745          746         0       1          Crosby, Capt. Edward Gifford   
33            34         0       2                 Wheadon, Mr. Edward H   
54            55         0       1        Ostby, Mr. Engelhart Cornelius   
280          281         0       3                      Duane, Mr. Frank   

      Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
630  male  80.0      0      0       27042  30.0000   A23        S  
851  male  74.0      0      0      347060   7.7750   NaN        S  
493  male  71.0      0      0    PC 17609  49.5042   NaN        C  
96   male  71.0      0      0    PC 17754  34.6542    A5        C  
116  male  70.5      0      0      370369   7.7500   NaN        Q  
672  male  70.0      0      0  C.A. 24580  10.5000   NaN        S  
745  male  70.0      1      1   WE/P 5735  71.0000   B22        S  
33   male  66.0      0      0  C.A. 24579  10.5000   NaN        S  
54   male  65.0      0      1      113509  61.9792   B30        C  
280  male  65.0      0      0      336439   7.7500   NaN        Q  
   PassengerId  Survived  Pclass                                  Name   Sex  \
0          631         1       1  Barkworth, Mr. Algernon Henry Wilson  male   
1          852         0       3                   Svensson, Mr. Johan  male   
2          494         0       1               Artagaveytia, Mr. Ramon  male   
3           97         0       1             Goldschmidt, Mr. George B  male   
4          117         0       3                  Connors, Mr. Patrick  male   
5          673         0       2           Mitchell, Mr. Henry Michael  male   
6          746         0       1          Crosby, Capt. Edward Gifford  male   
7           34         0       2                 Wheadon, Mr. Edward H  male   
8           55         0       1        Ostby, Mr. Engelhart Cornelius  male   
9          281         0       3                      Duane, Mr. Frank  male   

    Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
0  80.0      0      0       27042  30.0000   A23        S  
1  74.0      0      0      347060   7.7750   NaN        S  
2  71.0      0      0    PC 17609  49.5042   NaN        C  
3  71.0      0      0    PC 17754  34.6542    A5        C  
4  70.5      0      0      370369   7.7500   NaN        Q  
5  70.0      0      0  C.A. 24580  10.5000   NaN        S  
6  70.0      1      1   WE/P 5735  71.0000   B22        S  
7  66.0      0      0  C.A. 24579  10.5000   NaN        S  
8  65.0      0      1      113509  61.9792   B30        C  
9  65.0      0      0      336439   7.7500   NaN        Q  
'''

# This function returns the hundredth item from a series
def hundredth_row(column):
    # Extract the hundredth item
    hundredth_item = column.iloc[99]
    return hundredth_item

# Return the hundredth item from each column
hundredth_row = titanic_survival.apply(hundredth_row)
print (hundredth_row)
'''
PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S
dtype: object
'''

# 判断每列中缺失值个数
def not_null_count(column):
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)

column_null_count = titanic_survival.apply(not_null_count)
print (column_null_count)
'''
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
age_labels       0
dtype: int64
'''

# len(titanic_survival[pd.isnull(titanic_survival)])
# titanic_survival

#By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.
def which_class(row):
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    elif pclass == 1:
        return "First Class"
    elif pclass == 2:
        return "Second Class"
    elif pclass == 3:
        return "Third Class"

classes = titanic_survival.apply(which_class, axis=1)
print (classes)
'''
0       Third Class
1       First Class
2       Third Class
3       First Class
4       Third Class
           ...     
886    Second Class
887     First Class
888     Third Class
889     First Class
890     Third Class
Length: 891, dtype: object
'''

def is_minor(row):
    if row["Age"] < 18:
        return True
    else:
        return False

minors = titanic_survival.apply(is_minor, axis=1)
#print minors
# 离散化
def generate_age_label(row):
    age = row["Age"]
    if pd.isnull(age):
        return "unknown"
    elif age < 18:
        return "minor"
    else:
        return "adult"

age_labels = titanic_survival.apply(generate_age_label, axis=1)
print (age_labels)
'''
0        adult
1        adult
2        adult
3        adult
4        adult
        ...   
886      adult
887      adult
888    unknown
889      adult
890      adult
Length: 891, dtype: object
'''

titanic_survival['age_labels'] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print(age_group_survival)
'''
            Survived
age_labels          
adult       0.381032
minor       0.539823
unknown     0.293785
'''

# pandas_4

#Series (collection of values)
#DataFrame (collection of Series objects)
#Panel (collection of DataFrame objects)

#A Series object can hold many data types, including
#float - for representing float values
#int - for representing integer values
#bool - for representing Boolean values
#datetime64[ns] - for representing date & time, without time-zone
#datetime64[ns, tz] - for representing date & time, with time-zone
#timedelta[ns] - for representing differences in dates & times (seconds, minutes, etc.)
#category - for representing categorical values
#object - for representing String values

#FILM - film name
#RottenTomatoes - Rotten Tomatoes critics average score
#RottenTomatoes_User - Rotten Tomatoes user average score
#RT_norm - Rotten Tomatoes critics average score (normalized to a 0 to 5 point system)
#RT_user_norm - Rotten Tomatoes user average score (normalized to a 0 to 5 point system)
#Metacritic - Metacritic critics average score
#Metacritic_User - Metacritic user average score

import pandas as pd
fandango = pd.read_csv('fandango_score_comparison.csv')
series_film = fandango['FILM']
print(type(series_film))
print('=========================')
print(series_film[0:5])
print('=========================')
series_rt = fandango['RottenTomatoes']
print (series_rt[0:5])
'''
<class 'pandas.core.series.Series'>
=========================
0    Avengers: Age of Ultron (2015)
1                 Cinderella (2015)
2                    Ant-Man (2015)
3            Do You Believe? (2015)
4     Hot Tub Time Machine 2 (2015)
Name: FILM, dtype: object
=========================
0    74
1    85
2    80
3    18
4    14
Name: RottenTomatoes, dtype: int64
'''

fandango.head()
'''
    FILM    RottenTomatoes    RottenTomatoes_User    Metacritic    Metacritic_User    IMDB    Fandango_Stars    Fandango_Ratingvalue    RT_norm    RT_user_norm    ...    IMDB_norm    RT_norm_round    RT_user_norm_round    Metacritic_norm_round    Metacritic_user_norm_round    IMDB_norm_round    Metacritic_user_vote_count    IMDB_user_vote_count    Fandango_votes    Fandango_Difference
0    Avengers: Age of Ultron (2015)    74    86    66    7.1    7.8    5.0    4.5    3.70    4.3    ...    3.90    3.5    4.5    3.5    3.5    4.0    1330    271107    14846    0.5
1    Cinderella (2015)    85    80    67    7.5    7.1    5.0    4.5    4.25    4.0    ...    3.55    4.5    4.0    3.5    4.0    3.5    249    65709    12640    0.5
2    Ant-Man (2015)    80    90    64    8.1    7.8    5.0    4.5    4.00    4.5    ...    3.90    4.0    4.5    3.0    4.0    4.0    627    103660    12055    0.5
3    Do You Believe? (2015)    18    84    22    4.7    5.4    5.0    4.5    0.90    4.2    ...    2.70    1.0    4.0    1.0    2.5    2.5    31    3136    1793    0.5
4    Hot Tub Time Machine 2 (2015)    14    28    29    3.4    5.1    3.5    3.0    0.70    1.4    ...    2.55    0.5    1.5    1.5    1.5    2.5    88    19560    1021    0.5
5 rows × 22 columns
'''

# fandango.loc[[0,1],['FILM','RottenTomatoes']]
# fandango.FILM[0]
fandango.iloc[1,2]
'''
80
'''

# Import the Series object from pandas
from pandas import Series

film_names = series_film.values
print (type(film_names))
# print (film_names)
#print film_names
rt_scores = series_rt.values
#print (rt_scores)
series_custom = Series(rt_scores , index=film_names)
series_custom[['Minions (2015)', 'Leviathan (2014)']]
'''
<class 'numpy.ndarray'>
Minions (2015)      54
Leviathan (2014)    99
dtype: int64
'''

# int index is also aviable
series_custom = Series(rt_scores , index=film_names)
print(series_custom[['Minions (2015)', 'Leviathan (2014)']])
fiveten = series_custom[5:10]
print(fiveten)
'''
Minions (2015)      54
Leviathan (2014)    99
dtype: int64
The Water Diviner (2015)        63
Irrational Man (2015)           42
Top Five (2014)                 86
Shaun the Sheep Movie (2015)    99
Love & Mercy (2015)             89
dtype: int64
'''

original_index = series_custom.index.tolist()
# print(original_index)
sorted_index = sorted(original_index)
sorted_by_index = series_custom.reindex(sorted_index)
print (sorted_by_index)
'''
'71 (2015)                          97
5 Flights Up (2015)                 52
A Little Chaos (2015)               40
A Most Violent Year (2014)          90
About Elly (2015)                   97
                                    ..
What We Do in the Shadows (2015)    96
When Marnie Was There (2015)        89
While We're Young (2015)            83
Wild Tales (2014)                   96
Woman in Gold (2015)                52
Length: 146, dtype: int64
'''

sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()
#print(sc2[0:10])
print(sc3[0:10])
'''
Paul Blart: Mall Cop 2 (2015)     5
Hitman: Agent 47 (2015)           7
Hot Pursuit (2015)                8
Fantastic Four (2015)             9
Taken 3 (2015)                    9
The Boy Next Door (2015)         10
The Loft (2015)                  11
Unfinished Business (2015)       11
Mortdecai (2015)                 12
Seventh Son (2015)               12
dtype: int64
'''

#The values in a Series object are treated as an ndarray, the core data type in NumPy
import numpy as np
# Add each value with each other
print (np.add(series_custom, series_custom))
# Apply sine function to each value
np.sin(series_custom)
# Return the highest value (will return a single value not a Series)
np.max(series_custom)
'''
Avengers: Age of Ultron (2015)               148
Cinderella (2015)                            170
Ant-Man (2015)                               160
Do You Believe? (2015)                        36
Hot Tub Time Machine 2 (2015)                 28
                                            ... 
Mr. Holmes (2015)                            174
'71 (2015)                                   194
Two Days, One Night (2014)                   194
Gett: The Trial of Viviane Amsalem (2015)    200
Kumiko, The Treasure Hunter (2015)           174
Length: 146, dtype: int64

100
'''

#will actually return a Series object with a boolean value for each film
series_custom > 50
series_greater_than_50 = series_custom[series_custom > 50]

criteria_one = series_custom > 50
criteria_two = series_custom < 75
both_criteria = series_custom[criteria_one & criteria_two]
print(both_criteria)
'''
Avengers: Age of Ultron (2015)                                            74
The Water Diviner (2015)                                                  63
Unbroken (2014)                                                           51
Southpaw (2015)                                                           59
Insidious: Chapter 3 (2015)                                               59
The Man From U.N.C.L.E. (2015)                                            68
Run All Night (2015)                                                      60
5 Flights Up (2015)                                                       52
Welcome to Me (2015)                                                      71
Saint Laurent (2015)                                                      51
Maps to the Stars (2015)                                                  60
Pitch Perfect 2 (2015)                                                    67
The Age of Adaline (2015)                                                 54
The DUFF (2015)                                                           71
Ricki and the Flash (2015)                                                64
Unfriended (2015)                                                         60
American Sniper (2015)                                                    72
The Hobbit: The Battle of the Five Armies (2014)                          61
Paper Towns (2015)                                                        55
Big Eyes (2014)                                                           72
Maggie (2015)                                                             54
Focus (2015)                                                              57
The Second Best Exotic Marigold Hotel (2015)                              62
The 100-Year-Old Man Who Climbed Out the Window and Disappeared (2015)    67
Escobar: Paradise Lost (2015)                                             52
Into the Woods (2014)                                                     71
Inherent Vice (2014)                                                      73
Magic Mike XXL (2015)                                                     62
Woman in Gold (2015)                                                      52
The Last Five Years (2015)                                                60
Jurassic World (2015)                                                     71
Minions (2015)                                                            54
Spare Parts (2015)                                                        52
dtype: int64
'''

#data alignment same index
rt_critics = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'])
rt_users = Series(fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
rt_mean = (rt_critics + rt_users)/2

print(rt_mean)
'''
FILM
Avengers: Age of Ultron (2015)               80.0
Cinderella (2015)                            82.5
Ant-Man (2015)                               85.0
Do You Believe? (2015)                       51.0
Hot Tub Time Machine 2 (2015)                21.0
                                             ... 
Mr. Holmes (2015)                            82.5
'71 (2015)                                   89.5
Two Days, One Night (2014)                   87.5
Gett: The Trial of Viviane Amsalem (2015)    90.5
Kumiko, The Treasure Hunter (2015)           75.0
Length: 146, dtype: float64
'''

# pandas_5

import pandas as pd

#will return a new DataFrame that is indexed by the values in the specified column 
#and will drop that column from the DataFrame
#without the FILM column dropped 
fandango = pd.read_csv('fandango_score_comparison.csv')
print (type(fandango))
fandango_films = fandango.set_index('FILM', drop=False)
#print(fandango_films.index)
'''
<class 'pandas.core.frame.DataFrame'>
'''

# Slice using either bracket notation or loc[]
fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]
fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]

# Specific movie
fandango_films.loc['Kumiko, The Treasure Hunter (2015)']

# Selecting list of movies
movies = ['Kumiko, The Treasure Hunter (2015)', 'Do You Believe? (2015)', 'Ant-Man (2015)']
fandango_films.loc[movies]

#When selecting multiple rows, a DataFrame is returned, 
#but when selecting an individual row, a Series object is returned instead
'''
    FILM    RottenTomatoes    RottenTomatoes_User    Metacritic    Metacritic_User    IMDB    Fandango_Stars    Fandango_Ratingvalue    RT_norm    RT_user_norm    ...    IMDB_norm    RT_norm_round    RT_user_norm_round    Metacritic_norm_round    Metacritic_user_norm_round    IMDB_norm_round    Metacritic_user_vote_count    IMDB_user_vote_count    Fandango_votes    Fandango_Difference
FILM                                                                                    
Kumiko, The Treasure Hunter (2015)    Kumiko, The Treasure Hunter (2015)    87    63    68    6.4    6.7    3.5    3.5    4.35    3.15    ...    3.35    4.5    3.0    3.5    3.0    3.5    19    5289    41    0.0
Do You Believe? (2015)    Do You Believe? (2015)    18    84    22    4.7    5.4    5.0    4.5    0.90    4.20    ...    2.70    1.0    4.0    1.0    2.5    2.5    31    3136    1793    0.5
Ant-Man (2015)    Ant-Man (2015)    80    90    64    8.1    7.8    5.0    4.5    4.00    4.50    ...    3.90    4.0    4.5    3.0    4.0    4.0    627    103660    12055    0.5
3 rows × 22 columns
'''

#The apply() method in Pandas allows us to specify Python logic
#The apply() method requires you to pass in a vectorized operation 
#that can be applied over each Series object.
import numpy as np

# returns the data types as a Series
types = fandango_films.dtypes
#print types
# filter data types to just floats, index attributes returns just column names
float_columns = types[types.values == 'float64'].index
# use bracket notation to filter columns to just float columns
float_df = fandango_films[float_columns]
#print float_df
# `x` is a Series object representing a column
deviations = float_df.apply(lambda x: np.std(x))

print(deviations)
'''
Metacritic_User               1.505529
IMDB                          0.955447
Fandango_Stars                0.538532
Fandango_Ratingvalue          0.501106
RT_norm                       1.503265
RT_user_norm                  0.997787
Metacritic_norm               0.972522
Metacritic_user_nom           0.752765
IMDB_norm                     0.477723
RT_norm_round                 1.509404
RT_user_norm_round            1.003559
Metacritic_norm_round         0.987561
Metacritic_user_norm_round    0.785412
IMDB_norm_round               0.501043
Fandango_Difference           0.152141
dtype: float64
'''

rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
rt_mt_user.apply(lambda x: np.std(x), axis=1)
'''
FILM
Avengers: Age of Ultron (2015)               0.375
Cinderella (2015)                            0.125
Ant-Man (2015)                               0.225
Do You Believe? (2015)                       0.925
Hot Tub Time Machine 2 (2015)                0.150
                                             ...  
Mr. Holmes (2015)                            0.025
'71 (2015)                                   0.175
Two Days, One Night (2014)                   0.250
Gett: The Trial of Viviane Amsalem (2015)    0.200
Kumiko, The Treasure Hunter (2015)           0.025
Length: 146, dtype: float64
'''

posted @ 2019-12-13 18:20 LXL_1 阅读(209) 评论(0) 收藏举报

刷新页面返回顶部

LXL_1

数据分析处理库pandas

公告