R语言 dplyr包
R 编程语言中的 dplyr 包是一套数据操作的语法(grammar),它提供了一组风格统一的“动词”函数,帮助解决最常见的数据处理问题。
R 语言中的 dplyr 包能以更快、更简单的方式完成数据处理,原因如下:
- 通过限制可选项,可以把注意力更多地集中在数据操作本身的难点上。
- 它为每一种常见的数据操作都提供了简单的“动词”函数,因此想法可以更快地转化为代码。
- 它拥有高效的后端,因此等待计算机运算的时间更短。
重要的动词功能
dplyr 包提供了多种可用于数据处理的重要函数,它们是:
- filter() 函数: 根据变量的取值筛选观测(行)。
# Attach dplyr: it provides filter() and re-exports the %>% pipe. Without it,
# filter() resolves to stats::filter() and the calls below fail.
library(dplyr)

# Build a small data frame in which two of the heights (ht) are missing (NA).
d <- data.frame(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)
d

# Keep only the rows whose height is missing.
d %>% filter(is.na(ht))

# Keep only the rows whose height is present.
d %>% filter(!is.na(ht))
输出
# A tibble: 4 x 4
  name      age    ht school
1 Abhi        7    46 yes   
2 Bhavesh     5    NA yes   
3 Chaman      9    NA no    
4 Dimri      16    69 no
# A tibble: 2 x 4
  name      age    ht school
1 Bhavesh     5    NA yes   
2 Chaman      9    NA no
# A tibble: 2 x 4
  name    age    ht school
1 Abhi      7    46 yes   
2 Dimri    16    69 no
- arrange(): 用于对观测(行)重新排序。
# Attach dplyr so that arrange() is available when this snippet is run on its
# own in a fresh session.
library(dplyr)

# Build a small data frame in which two of the heights (ht) are missing (NA).
d <- data.frame(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# Sort the rows by ascending age. Despite its name, d.name holds the whole
# reordered data frame, not just the name column.
d.name <- arrange(d, age)
print(d.name)
输出
# A tibble: 4 x 4
  name      age    ht school
1 Bhavesh     5    NA yes   
2 Abhi        7    46 yes   
3 Chaman      9    NA no    
4 Dimri      16    69 no   
- select() 和 rename(): 用于根据名称选择变量(列)或为变量重命名。
# Attach dplyr, which provides select() and the selection helpers used below.
library(dplyr)

# Build a small data frame in which two of the heights (ht) are missing (NA).
d <- data.frame(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# starts_with(): keep only the columns whose name begins with "ht".
select(d, starts_with("ht"))

# Negated starts_with(): keep every column except those beginning with "ht".
select(d, -starts_with("ht"))

# Positional range: keep columns 1 through 2.
select(d, 1:2)

# contains(): keep the columns whose name contains the literal string "a".
select(d, contains("a"))

# matches(): keep the columns whose name matches the regular expression "na".
select(d, matches("na"))
输出
# A tibble: 4 x 1
     ht
1    46
2    NA
3    NA
4    69
# A tibble: 4 x 3
  name      age school
1 Abhi        7 yes   
2 Bhavesh     5 yes   
3 Chaman      9 no    
4 Dimri      16 no
# A tibble: 4 x 2
  name      age
1 Abhi        7
2 Bhavesh     5
3 Chaman      9
4 Dimri      16
# A tibble: 4 x 2
  name      age
1 Abhi        7
2 Bhavesh     5
3 Chaman      9
4 Dimri      16
# A tibble: 4 x 1
  name   
1 Abhi   
2 Bhavesh
3 Chaman 
4 Dimri
- mutate() 和 transmute(): 添加新变量,这些新变量是现有变量的函数(即由现有变量计算得到)。
# Attach dplyr so that mutate() and transmute() are available when this
# snippet is run on its own in a fresh session.
library(dplyr)

# Build a small data frame in which two of the heights (ht) are missing (NA).
d <- data.frame(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# mutate(): append x3 = ht + age and keep all existing columns.
# Rows with a missing ht yield NA, because NA propagates through `+`.
mutate(d, x3 = ht + age)

# transmute(): compute the same x3 but return only that new column.
transmute(d, x3 = ht + age)
输出
# A tibble: 4 x 5
  name      age    ht school    x3
1 Abhi        7    46 yes       53
2 Bhavesh     5    NA yes       NA
3 Chaman      9    NA no        NA
4 Dimri      16    69 no        85
# A tibble: 4 x 1
     x3
1    53
2    NA
3    NA
4    85
> 
- summarise(): 将多个值汇总为单个值。
# Attach dplyr so that summarise() is available when this snippet is run on
# its own in a fresh session.
library(dplyr)

# Build a small data frame in which two of the heights (ht) are missing (NA).
d <- data.frame(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# Each call collapses the age column to a one-row, one-column result.

# Mean of age.
summarise(d, mean = mean(age))

# Minimum of age. The result column keeps the label `med` so that it matches
# the output listing that accompanies this example.
summarise(d, med = min(age))

# Maximum of age (label `med` kept for the same reason).
summarise(d, med = max(age))

# Median of age.
summarise(d, med = median(age))
输出
# A tibble: 1 x 1
      mean
1     9.25
# A tibble: 1 x 1
    med
1     5
# A tibble: 1 x 1
    med
1    16
# A tibble: 1 x 1
    med
1     8
- sample_n() 和 sample_frac(): 用于随机抽取样本(行)。
# Attach dplyr so that sample_n() and sample_frac() are available when this
# snippet is run on its own in a fresh session.
library(dplyr)

# Build a small data frame in which two of the heights (ht) are missing (NA).
d <- data.frame(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# Both calls draw rows at random, so the rows returned differ between runs
# unless a seed is set beforehand.
# NOTE(review): dplyr >= 1.0.0 marks sample_n()/sample_frac() as superseded by
# slice_sample(); they are kept here because this example documents them.

# Draw three rows.
sample_n(d, 3)

# Draw 50 % of the rows (two of the four).
sample_frac(d, 0.50)
输出
# A tibble: 3 x 4
  name      age    ht school
1 Abhi        7    46 yes   
2 Bhavesh     5    NA yes   
3 Chaman      9    NA no 
# A tibble: 2 x 4
  name      age    ht school
1 Dimri      16    69 no    
2 Bhavesh     5    NA yes  
 极客教程
极客教程