R语言 dplyr包

R 编程语言中的 dplyr 包是一套用于数据操作的工具，它提供了一组统一的“动词”函数，帮助解决最常见的数据操作难题。

R语言中的dplyr包以更快、更简单的方式执行下面给出的步骤。

通过限制选择，现在可以把重点放在数据操作的困难上。
它为每一种常见的数据操作都提供了简单的“动词”函数，从而可以更快地把想法转化为代码。
它提供了高效的后端，因此减少了等待计算机运算的时间。

重要的动词功能

dplyr 包提供了多种可用于数据处理的重要函数，如下所示。

filter() 函数：根据取值条件筛选观测（行）。

# Load dplyr, which provides filter() and the %>% pipe used below.
# Without it, filter() would resolve to stats::filter() and fail.
library(dplyr)

# Create a data frame with missing data (NA) in the ht column
d <- data.frame(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)
d

# Keep only the rows whose ht value is missing
d %>% filter(is.na(ht))

# Keep only the rows whose ht value is present
d %>% filter(!is.na(ht))

输出

# A tibble: 4 x 4
  name      age    ht school

1 Abhi        7    46 yes   
2 Bhavesh     5    NA yes   
3 Chaman      9    NA no    
4 Dimri      16    69 no

# A tibble: 2 x 4
  name      age    ht school

1 Bhavesh     5    NA yes   
2 Chaman      9    NA no

# A tibble: 2 x 4
  name    age    ht school

1 Abhi      7    46 yes   
2 Dimri    16    69 no

arrange()：用于对观测（行）重新排序。

# Example data: four rows, with missing values (NA) in the ht column
d <- data.frame(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# Sort the rows by ascending age and print the reordered data frame
d.name <- d %>% arrange(age)
print(d.name)

输出

# A tibble: 4 x 4
  name      age    ht school

1 Bhavesh     5    NA yes   
2 Abhi        7    46 yes   
3 Chaman      9    NA no    
4 Dimri      16    69 no

select() 和 rename()：根据变量名选择变量（列）或对其重命名。

# Create a data frame with missing data (NA) in the ht column
d <- data.frame(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# starts_with() keeps only the columns whose name begins with "ht"
select(d, starts_with("ht"))

# Negating starts_with() keeps every column except those
# whose name begins with "ht"
select(d, -starts_with("ht"))

# Select columns by position: columns 1 to 2
select(d, 1:2)

# contains() keeps the columns whose name contains the
# literal string "a"
select(d, contains("a"))

# matches() keeps the columns whose name matches the
# regular expression "na"
select(d, matches("na"))

输出

# A tibble: 4 x 1
     ht

1    46
2    NA
3    NA
4    69

# A tibble: 4 x 3
  name      age school

1 Abhi        7 yes   
2 Bhavesh     5 yes   
3 Chaman      9 no    
4 Dimri      16 no

# A tibble: 4 x 2
  name      age

1 Abhi        7
2 Bhavesh     5
3 Chaman      9
4 Dimri      16

# A tibble: 4 x 2
  name      age

1 Abhi        7
2 Bhavesh     5
3 Chaman      9
4 Dimri      16

# A tibble: 4 x 1
  name   

1 Abhi   
2 Bhavesh
3 Chaman 
4 Dimri

mutate() 和 transmute()：增加新的变量，这些变量是现有变量的函数（即由现有变量计算得到）。

# Example data: four rows, with missing values (NA) in the ht column
d <- data.frame(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# Add x3 (height plus age) as a new column next to the existing ones;
# rows with a missing ht yield NA
d %>% mutate(x3 = ht + age)

# Compute the same x3 but return it as the only column
d %>% transmute(x3 = ht + age)

输出


# A tibble: 4 x 5
  name      age    ht school    x3

1 Abhi        7    46 yes       53
2 Bhavesh     5    NA yes       NA
3 Chaman      9    NA no        NA
4 Dimri      16    69 no        85

# A tibble: 4 x 1
     x3

1    53
2    NA
3    NA
4    85

summarise()：将多个数值汇总为单个数值。

# Example data: four rows, with missing values (NA) in the ht column
d <- data.frame(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# Each call collapses the age column to a one-row, one-column result.

# Mean of age, returned in a column named "mean"
d %>% summarise(mean = mean(age))

# Minimum of age (the result column is named "med")
d %>% summarise(med = min(age))

# Maximum of age (the result column is named "med")
d %>% summarise(med = max(age))

# Median of age, returned in a column named "med"
d %>% summarise(med = median(age))

输出

# A tibble: 1 x 1
      mean

1     9.25

# A tibble: 1 x 1
    med

1     5

# A tibble: 1 x 1
    med

1    16

# A tibble: 1 x 1
    med

1     8

sample_n() 和 sample_frac()：用于抽取随机样本。

# Example data: four rows, with missing values (NA) in the ht column
d <- data.frame(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# Draw three rows at random (the rows chosen vary between runs)
d %>% sample_n(3)

# Draw a random 50 % of the rows
d %>% sample_frac(0.50)

输出

# A tibble: 3 x 4
  name      age    ht school

1 Abhi        7    46 yes   
2 Bhavesh     5    NA yes   
3 Chaman      9    NA no 

# A tibble: 2 x 4
  name      age    ht school

1 Dimri      16    69 no    
2 Bhavesh     5    NA yes