Importing data in R 1

Importing data in R 学习笔记1

flat files:CSV

# Import swimming_pools.csv correctly: pools
pools<-read.csv("swimming_pools.csv",stringsAsFactors=FALSE)

txt文件

read.delim("name.txt",header=TRUE)

转化为table
# Path to the hotdogs.txt file: path
> path <- file.path("data", "hotdogs.txt")
> 
> # Import the hotdogs.txt file: hotdogs
> hotdogs <- read.table(path, 
                        sep = "\t", 
                        col.names = c("type", "calories", "sodium"))
> 
> # Call head() on hotdogs
> head(hotdogs)
  type calories sodium
1 Beef      186    495
2 Beef      181    477
3 Beef      176    425
4 Beef      149    322
5 Beef      184    482
6 Beef      190    587

tibble:简单数据框
read_对比read.
前者产生一个简单的数据框,并且会展示每一列的数据类型

packages:readr

read_csv()

读入csv格式
read_csv and read_tsv are special cases of the general read_delim. They're useful for reading the most common types of flat file data, comma separated values and tab separated values, respectively. read_csv2 uses ; for separators, instead of ,. This is common in European countries which use , as the decimal separator

read_tsv

读入txt格式

> # readr is already loaded
> 
> # Column names
> properties <- c("area", "temp", "size", "storage", "method",
                  "texture", "flavor", "moistness")
> 
> # Import potatoes.txt: potatoes
读入数据并指定行名
> potatoes<-read_tsv("potatoes.txt",col_names=properties)
Parsed with column specification:
cols(
  area = col_integer(),
  temp = col_integer(),
  size = col_integer(),
  storage = col_integer(),
  method = col_integer(),
  texture = col_double(),
  flavor = col_double(),
  moistness = col_double()
)
> col_names=properties
> 
> # Call head() on potatoes
> head(potatoes)
# A tibble: 6 x 8
   area  temp  size storage method texture flavor moistness
  <int> <int> <int>   <int>  <int>   <dbl>  <dbl>     <dbl>
1     1     1     1       1      1     2.9    3.2       3  
2     1     1     1       1      2     2.3    2.5       2.6
3     1     1     1       1      3     2.5    2.8       2.8
4     1     1     1       1      4     2.1    2.9       2.4
5     1     1     1       1      5     1.9    2.8       2.2
6     1     1     1       2      1     1.8    3         1.7

read_delim()

# Column names
> properties <- c("area", "temp", "size", "storage", "method",
                  "texture", "flavor", "moistness")
> 
> # Import potatoes.txt using read_delim(): potatoes
> potatoes <- read_delim("potatoes.txt", delim = "\t", col_names = properties)
Parsed with column specification:
cols(
  area = col_integer(),
  temp = col_integer(),
  size = col_integer(),
  storage = col_integer(),
  method = col_integer(),
  texture = col_double(),
  flavor = col_double(),
  moistness = col_double()
)
> 
> # Print out potatoes
> potatoes
# A tibble: 160 x 8
    area  temp  size storage method texture flavor moistness
   <int> <int> <int>   <int>  <int>   <dbl>  <dbl>     <dbl>
 1     1     1     1       1      1     2.9    3.2       3  
 2     1     1     1       1      2     2.3    2.5       2.6
 3     1     1     1       1      3     2.5    2.8       2.8
 4     1     1     1       1      4     2.1    2.9       2.4
 5     1     1     1       1      5     1.9    2.8       2.2
 6     1     1     1       2      1     1.8    3         1.7
 7     1     1     1       2      2     2.6    3.1       2.4
 8     1     1     1       2      3     3      3         2.9
 9     1     1     1       2      4     2.2    3.2       2.5
10     1     1     1       2      5     2      2.8       1.9
# ... with 150 more rows

data.table()

fread

make up some column names itself
more convenience

 # Import columns 6 and 8 of potatoes.csv: potatoes
> potatoes<-fread("potatoes.csv",select=c(6,8))
> 
> # Plot texture (x) and moistness (y) of potatoes
> plot(potatoes$texture,potatoes$moistness)

readxl

excel_sheets()

library(readxl)
# Print the names of all worksheets
excel_sheets("urbanpop.xlsx")
# Read all Excel sheets with lapply(): pop_list
pop_list<- lapply(excel_sheets("urbanpop.xlsx"),
                      read_excel,
                      path = "urbanpop.xlsx")

# Display the structure of pop_list
str(pop_list)

read_excel()

# Import the second sheet of urbanpop.xlsx, skipping the first 21 rows: urbanpop_sel
urbanpop_sel <- read_excel("urbanpop.xlsx", sheet = 2, col_names = FALSE, skip = 21)

# Print out the first observation from urbanpop_sel
urbanpop_sel[1,]

gdata

read.xls()

读入xls格式的数据

# Column names for urban_pop
> columns <- c("country", paste0("year_", 1967:1974))
> 
> # Finish the read.xls call
> urban_pop <- read.xls("urbanpop.xls", sheet = 2,
                        skip = 50, header = FALSE, stringsAsFactors = FALSE,
                        col.names = columns)
> 
> # Print first 10 observation of urban_pop
> head(urban_pop,n=10)
              country   year_1967   year_1968   year_1969   year_1970
1              Cyprus   231929.74   237831.38   243983.34   250164.52
2      Czech Republic  6204409.91  6266304.50  6326368.97  6348794.89
3             Denmark  3777552.62  3826785.08  3874313.99  3930042.97
4            Djibouti    77788.04    84694.35    92045.77    99845.22
5            Dominica    27550.36    29527.32    31475.62    33328.25
6  Dominican Republic  1535485.43  1625455.76  1718315.40  1814060.00
7             Ecuador  2059355.12  2151395.14  2246890.79  2345864.41
8               Egypt 13798171.00 14248342.19 14703858.22 15162858.52
9         El Salvador  1345528.98  1387218.33  1429378.98  1472181.26
10  Equatorial Guinea    75364.50    77295.03    78445.74    78411.07
     year_1971   year_1972   year_1973   year_1974
1    261213.21   272407.99   283774.90   295379.83
2   6437055.17  6572632.32  6718465.53  6873458.18
3   3981360.12  4028247.92  4076867.28  4120201.43
4    107799.69   116098.23   125391.58   136606.25
5     34761.52    36049.99    37260.05    38501.47
6   1915590.38  2020157.01  2127714.45  2238203.87
7   2453817.78  2565644.81  2681525.25  2801692.62
8  15603661.36 16047814.69 16498633.27 16960827.93
9   1527985.34  1584758.18  1642098.95  1699470.87
10    77055.29    74596.06    71438.96    68179.26

getSheets()

查看一个excel文件有多少的sheet,输出每个sheet的名字

XLConnect

loadWorkbook()

主要是加载excel文件
When working with XLConnect, the first step will be to load a workbook in your R session with loadWorkbook(); this function will build a "bridge" between your Excel file and your R session.

library("XLConnect")
> 
> # Build connection to urbanpop.xlsx: my_book
> my_book<-loadWorkbook("urbanpop.xlsx")
> 
> # Print out the class of my_book
> class(my_book)
[1] "workbook"
attr(,"package")
[1] "XLConnect"

readWorksheet()

读取excel文件
所以顺序肯定是先加载再读取啊。

# Import columns 3, 4, and 5 from second sheet in my_book: urbanpop_sel
urbanpop_sel <- readWorksheet(my_book, sheet = 2,startCol=3,endCol=5)

# Import first column from second sheet in my_book: countries
countries<-readWorksheet(my_book, sheet = 2,startCol=1,endCol=1)

# cbind() urbanpop_sel and countries together: selection
selection<-cbind(countries,urbanpop_sel)

createSheet()

在已经有的excel中创建一个sheet,创建一个空的sheet

# Build connection to urbanpop.xlsx
> my_book <- loadWorkbook("urbanpop.xlsx")
> 
> # Add a worksheet to my_book, named "data_summary"
> createSheet(my_book,"data_summary")
> 
> # Use getSheets() on my_book
> getSheets(my_book)
[1] "1960-1966"    "1967-1974"    "1975-2011"    "data_summary"

writeWorksheet()

Writes data to worksheets of a '>workbook.

saveWorkbook

保存工作表,就是存到磁盘上

# Build connection to urbanpop.xlsx
my_book <- loadWorkbook("urbanpop.xlsx")

# Add a worksheet to my_book, named "data_summary"
createSheet(my_book, "data_summary")

# Create data frame: summ
sheets <- getSheets(my_book)[1:3]
dims <- sapply(sheets, function(x) dim(readWorksheet(my_book, sheet = x)), USE.NAMES = FALSE)
summ <- data.frame(sheets = sheets,
                   nrows = dims[1, ],
                   ncols = dims[2, ])

# Add data in summ to "data_summary" sheet
writeWorksheet(my_book,summ,"data_summary")

# Save workbook as summary.xlsx
 saveWorkbook(my_book,"summary.xlsx")

renameSheet()

给sheet表重命名

# Rename "data_summary" sheet to "summary"
renameSheet(my_book, "data_summary", "summary")

# Print out sheets of my_book
getSheets(my_book)

# Save workbook to "renamed.xlsx"
saveWorkbook(my_book, file = "renamed.xlsx")

我发现我自己真的很容易丢参数哦,然后死活调不出来。。。===。。。苦恼的人儿

removeSheet()

删除指定sheet

library(XLConnect)
# Build connection to renamed.xlsx: my_book
my_book<-loadWorkbook("renamed.xlsx")


# Remove the fourth sheet
removeSheet(my_book,sheet="summary")

# Save workbook to "clean.xlsx"
saveWorkbook(my_book,"clean.xlsx")
posted @ 2019-12-13 18:54  高文星星  阅读(310)  评论(0编辑  收藏  举报