Rdplyr的单表动词

2024-04-15 12:35:04 228

示例

dplyr在中引入了数据处理语法R。它提供了一个一致的接口来处理数据，无论数据存储在哪里：data.frame，data.table或database。其中的关键部分dplyr是使用Rcpp编写的，这使得使用内存数据的速度非常快。

dplyr公司的理念是拥有能做一件事的小功能。这五个简单的功能（filter，arrange，select，mutate，和summarise）可用于揭示新的方式来描述数据。与结合使用时group_by，这些功能可用于计算按组汇总统计信息。

语法共性

所有这些功能都具有相似的语法：

所有这些功能的第一个参数始终是数据帧

可以使用裸变量名称直接引用列（即，不使用$）

这些函数不会修改原始数据本身，即它们没有副作用。因此，结果应始终保存到对象。

我们将使用内置的mtcars数据集来探索dplyr的单表动词。转换的类型的前mtcars向tbl_df（因为它使打印机），我们添加rownames作为使用列中的数据集的rownames_to_column函数从tibble包。

library(dplyr) # This documentation was written using version 0.5.0

mtcars_tbl <- as_data_frame(tibble::rownames_to_column(mtcars, "cars"))

# examine the structure of data
head(mtcars_tbl)

# A tibble: 6 x 12
#               cars   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#              <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1         Mazda RX4  21.0     6   160   110  3.90 2.620 16.46     0     1     4     4
#2     Mazda RX4 Wag  21.0     6   160   110  3.90 2.875 17.02     0     1     4     4
#3        Datsun 710  22.8     4   108    93  3.85 2.320 18.61     1     1     4     1
#4    Hornet 4 Drive  21.4     6   258   110  3.08 3.215 19.44     1     0     3     1
#5 Hornet Sportabout  18.7     8   360   175  3.15 3.440 17.02     0     0     3     2
#6           Valiant  18.1     6   225   105  2.76 3.460 20.22     1     0     3     1

过滤

filter帮助符合特定条件的子集行。第一个参数是的名称，data.frame第二个（及后续）参数是用于过滤数据的条件（这些条件的值应为TRUE或FALSE）

细分所有具有4个气缸的汽车-cyl：

filter(mtcars_tbl, cyl == 4) 

# A tibble: 11 x 12
#             cars   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#            <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1      Datsun 710  22.8     4 108.0    93  3.85 2.320 18.61     1     1     4     1
#2       Merc 240D  24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2
#3        Merc 230  22.8     4 140.8    95  3.92 3.150 22.90     1     0     4     2
#4        Fiat 128  32.4     4  78.7    66  4.08 2.200 19.47     1     1     4     1
#5     Honda Civic  30.4     4  75.7    52  4.93 1.615 18.52     1     1     4     2
# ... with 6 more rows

我们可以通过逗号分隔多个条件。要将具有4个或6个气缸-cyl且具有5个齿轮的汽车子集-gear：

filter(mtcars_tbl, cyl == 4 | cyl == 6, gear == 5)

# A tibble: 3 x 12
#           cars   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#          <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Porsche 914-2  26.0     4 120.3    91  4.43 2.140  16.7     0     1     5     2
#2  Lotus Europa  30.4     4  95.1   113  3.77 1.513  16.9     1     1     5     2
#3  Ferrari Dino  19.7     6 145.0   175  3.62 2.770  15.5     0     1     5     6

filter根据条件选择行，使用按位置选择行slice。slice仅使用2个参数：第一个是adata.frame，第二个是整数行值。

要选择第6至9行：

slice(mtcars_tbl, 6:9)

# A tibble: 4 x 12
#        cars   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#       <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1    Valiant  18.1     6 225.0   105  2.76  3.46 20.22     1     0     3     1
#2 Duster 360  14.3     8 360.0   245  3.21  3.57 15.84     0     0     3     4
#3  Merc 240D  24.4     4 146.7    62  3.69  3.19 20.00     1     0     4     2
#4   Merc 230  22.8     4 140.8    95  3.92  3.15 22.90     1     0     4     2

要么：

slice(mtcars_tbl, -c(1:5, 10:n()))

结果与输出相同slice(mtcars_tbl,6:9)

n()代表当前组中的观察数

安排

arrange用于按指定的排序数据variable(s)。与前一个动词（以及中的所有其他函数dplyr）一样，第一个参数是data.frame，随后的参数用于对数据进行排序。如果传递了多个变量，则数据首先按第一个变量排序，然后按第二个变量排序，依此类推。

要以马力订购数据-hp

arrange(mtcars_tbl, hp) 

# A tibble: 32 x 12
#             cars   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#            <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1     Honda Civic  30.4     4  75.7    52  4.93 1.615 18.52     1     1     4     2
#2       Merc 240D  24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2
#3  Toyota Corolla  33.9     4  71.1    65  4.22 1.835 19.90     1     1     4     1
#4        Fiat 128  32.4     4  78.7    66  4.08 2.200 19.47     1     1     4     1
#5       Fiat X1-9  27.3     4  79.0    66  4.08 1.935 18.90     1     1     4     1
#6   Porsche 914-2  26.0     4 120.3    91  4.43 2.140 16.70     0     1     5     2
# ... with 26 more rows

以英里/加仑为单位arrange的数据-降序排列，后跟气缸数-：mpgcyl

arrange(mtcars_tbl, desc(mpg), cyl)

# A tibble: 32 x 12
#             cars   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#            <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1  Toyota Corolla  33.9     4  71.1    65  4.22 1.835 19.90     1     1     4     1
#2        Fiat 128  32.4     4  78.7    66  4.08 2.200 19.47     1     1     4     1
#3     Honda Civic  30.4     4  75.7    52  4.93 1.615 18.52     1     1     4     2
#4    Lotus Europa  30.4     4  95.1   113  3.77 1.513 16.90     1     1     5     2
#5       Fiat X1-9  27.3     4  79.0    66  4.08 1.935 18.90     1     1     4     1
#6   Porsche 914-2  26.0     4 120.3    91  4.43 2.140 16.70     0     1     5     2
# ... with 26 more rows

选择

select用于仅选择变量的子集。只选择mpg，disp，wt，qsec，和vs来自mtcars_tbl：

select(mtcars_tbl, mpg, disp, wt, qsec, vs)

# A tibble: 32 x 5
#     mpg  disp    wt  qsec    vs
#   <dbl> <dbl> <dbl> <dbl> <dbl>
#1   21.0 160.0 2.620 16.46     0
#2   21.0 160.0 2.875 17.02     0
#3   22.8 108.0 2.320 18.61     1
#4   21.4 258.0 3.215 19.44     1
#5   18.7 360.0 3.440 17.02     0
#6   18.1 225.0 3.460 20.22     1
# ... with 26 more rows

:表示法可用于选择连续的列。要cars通过disp和vs来选择列carb：

select(mtcars_tbl, cars:disp, vs:carb)

# A tibble: 32 x 8
#                cars   mpg   cyl  disp    vs    am  gear  carb
#               <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1          Mazda RX4  21.0     6 160.0     0     1     4     4
#2      Mazda RX4 Wag  21.0     6 160.0     0     1     4     4
#3         Datsun 710  22.8     4 108.0     1     1     4     1
#4     Hornet 4 Drive  21.4     6 258.0     1     0     3     1
#5  Hornet Sportabout  18.7     8 360.0     0     0     3     2
#6            Valiant  18.1     6 225.0     1     0     3     1
# ... with 26 more rows

要么select(mtcars_tbl,-(hp:qsec))

对于包含多个列的数据集，按名称选择多个列可能很繁琐。为了使生活更轻松，也有一些辅助功能（如starts_with()，ends_with()，contains()，matches()，num_range()，one_of()，和everything()），可在使用select。要了解有关如何使用它们的更多信息，请参见?select_helpers和?select。

注意：在中直接引用列时select()，我们使用裸列名称，但是在帮助函数中引用列时应使用引号。

要在选择时重命名列：

select(mtcars_tbl, cylinders = cyl, displacement = disp) 

# A tibble: 32 x 2
#   cylinders displacement
#       <dbl>        <dbl>
#1          6        160.0
#2          6        160.0
#3          4        108.0
#4          6        258.0
#5          8        360.0
#6          6        225.0
# ... with 26 more rows

如预期的那样，这将删除所有其他变量。

要重命名列而不删除其他变量，请使用rename：

rename(mtcars_tbl, cylinders = cyl, displacement = disp)

# A tibble: 32 x 12
#                cars   mpg cylinders displacement    hp  drat    wt  qsec    vs
#               <chr> <dbl>     <dbl>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1          Mazda RX4  21.0         6        160.0   110  3.90 2.620 16.46     0
#2      Mazda RX4 Wag  21.0         6        160.0   110  3.90 2.875 17.02     0
#3         Datsun 710  22.8         4        108.0    93  3.85 2.320 18.61     1
#4     Hornet 4 Drive  21.4         6        258.0   110  3.08 3.215 19.44     1
#5  Hornet Sportabout  18.7         8        360.0   175  3.15 3.440 17.02     0
#6            Valiant  18.1         6        225.0   105  2.76 3.460 20.22     1
# ... with 26 more rows, and 3 more variables: am <dbl>, gear <dbl>, carb <dbl>

变异

mutate可用于向数据添加新列。与一样dplyr，mutate不会将新创建的列添加到原始数据中。列添加在的末尾data.frame。

mutate(mtcars_tbl, weight_ton = wt/2, weight_pounds = weight_ton * 2000)

# A tibble: 32 x 14
#                cars   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb weight_ton weight_pounds
#               <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>      <dbl>         <dbl>
#1          Mazda RX4  21.0     6 160.0   110  3.90 2.620 16.46     0     1     4     4     1.3100          2620
#2      Mazda RX4 Wag  21.0     6 160.0   110  3.90 2.875 17.02     0     1     4     4     1.4375          2875
#3         Datsun 710  22.8     4 108.0    93  3.85 2.320 18.61     1     1     4     1     1.1600          2320
#4     Hornet 4 Drive  21.4     6 258.0   110  3.08 3.215 19.44     1     0     3     1     1.6075          3215
#5  Hornet Sportabout  18.7     8 360.0   175  3.15 3.440 17.02     0     0     3     2     1.7200          3440
#6            Valiant  18.1     6 225.0   105  2.76 3.460 20.22     1     0     3     1     1.7300          3460
# ... with 26 more rows

请注意weight_ton创建时的使用weight_pounds。与base不同R，mutate我们可以引用刚创建的列以用于后续操作。

要仅保留新创建的列，请使用transmute代替mutate：

transmute(mtcars_tbl, weight_ton = wt/2, weight_pounds = weight_ton * 2000)

# A tibble: 32 x 2
#   weight_ton weight_pounds
#        <dbl>         <dbl>
#1      1.3100          2620
#2      1.4375          2875
#3      1.1600          2320
#4      1.6075          3215
#5      1.7200          3440
#6      1.7300          3460
# ... with 26 more rows

总结

summarise通过将多个值折叠为一个值来计算变量的摘要统计量。它可以计算多个统计信息，我们可以在同一条语句中命名这些汇总列。

为了计算平均值和标准偏差的mpg，并disp在数据集中的所有车型：

summarise(mtcars_tbl, mean_mpg = mean(mpg), sd_mpg = sd(mpg), 
          mean_disp = mean(disp), sd_disp = sd(disp))

# A tibble: 1 x 4
#  mean_mpg   sd_mpg mean_disp  sd_disp
#     <dbl>    <dbl>     <dbl>    <dbl>
#1 20.09062 6.026948  230.7219 123.9387

通过...分组

group_by可用于对数据执行分组操作。将以上定义的动词应用于此分组数据时，它们会自动自动分别应用于每个组。

查找mean和sd的mpg依据cyl：

by_cyl <- group_by(mtcars_tbl, cyl)
summarise(by_cyl, mean_mpg = mean(mpg), sd_mpg = sd(mpg))


# A tibble: 3 x 3
#    cyl mean_mpg   sd_mpg
#  <dbl>    <dbl>    <dbl>
#1     4 26.66364 4.509828
#2     6 19.74286 1.453567
#3     8 15.10000 2.560048

全部放在一起

我们从选择列cars通过hp并gear，以便通过行cyl和从最高到最低mpg，组由数据gear，最后子集只有那些车有mpg>20hp>75

selected <- select(mtcars_tbl, cars:hp, gear)
ordered <- arrange(selected, cyl, desc(mpg))
by_cyl <- group_by(ordered, gear)
filter(by_cyl, mpg > 20, hp > 75)

Source: local data frame [9 x 6]
Groups: gear [3]

#            cars   mpg   cyl  disp    hp  gear
#           <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1   Lotus Europa  30.4     4  95.1   113     5
#2  Porsche 914-2  26.0     4 120.3    91     5
#3     Datsun 710  22.8     4 108.0    93     4
#4       Merc 230  22.8     4 140.8    95     4
#5  Toyota Corona  21.5     4 120.1    97     3
# ... with 4 more rows

也许我们对中间结果不感兴趣，我们可以通过包装函数调用来获得与上述相同的结果：

filter(
    group_by(
        arrange(
            select(
                mtcars_tbl, cars:hp
            ), cyl, desc(mpg)
        ), cyl   
    ),mpg > 20, hp > 75 
)

这可能有点难以理解。因此，dplyr可以使用管道%>%运算符将操作链接在一起。上面的代码可转换为：

mtcars_tbl %>% 
    select(cars:hp) %>% 
    arrange(cyl, desc(mpg)) %>%
    group_by(cyl) %>% 
    filter(mpg > 20, hp > 75)

汇总多列

dplyr提供summarise_all()将功能应用于所有（非分组）列的功能。

要查找每列的不同值的数量：

mtcars_tbl %>% 
    summarise_all(n_distinct)

# A tibble: 1 x 12
#   cars   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#  <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#1    32    25     3    27    22    22    29    30     2     2     3     6

通过以下方式查找每列的不同值的数量cyl：

mtcars_tbl %>% 
    group_by(cyl) %>% 
    summarise_all(n_distinct)

# A tibble: 3 x 12
#    cyl  cars   mpg  disp    hp  drat    wt  qsec    vs    am  gear  carb
#  <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#1     4    11     9    11    10    10    11    11     2     2     3     2
#2     6     7     6     5     4     5     6     7     2     2     3     3
#3     8    14    12    11     9    11    13    14     1     2     2     4

请注意，我们只需要添加该group_by语句，其余代码相同。现在，输出包括三行-对应于的每个唯一值cyl。

要指定summarise多个列，请使用summarise_at

mtcars_tbl %>% 
    group_by(cyl) %>% 
    summarise_at(c("mpg", "disp", "hp"), mean)

# A tibble: 3 x 4
#    cyl      mpg     disp        hp
#  <dbl>    <dbl>    <dbl>     <dbl>
#1     4 26.66364 105.1364  82.63636
#2     6 19.74286 183.3143 122.28571
#3     8 15.10000 353.1000 209.21429

helper?select_helpers可以使用函数（）代替列名来选择特定的列

要应用多个函数，请将函数名称作为字符向量传递：

mtcars_tbl %>% 
    group_by(cyl) %>% 
    summarise_at(c("mpg", "disp", "hp"), 
                 c("mean", "sd"))

或将它们包裹在里面funs：

mtcars_tbl %>% 
    group_by(cyl) %>% 
    summarise_at(c("mpg", "disp", "hp"), 
                 funs(mean, sd))

# A tibble: 3 x 7
#    cyl mpg_mean disp_mean   hp_mean   mpg_sd  disp_sd    hp_sd
#  <dbl>    <dbl>     <dbl>     <dbl>    <dbl>    <dbl>    <dbl>
#1     4 26.66364  105.1364  82.63636 4.509828 26.87159 20.93453
#2     6 19.74286  183.3143 122.28571 1.453567 41.56246 24.26049
#3     8 15.10000  353.1000 209.21429 2.560048 67.77132 50.97689

现在，列名后面会附加函数名，以使它们与众不同。为了更改此设置，请传递要附加在函数后的名称：

mtcars_tbl %>% 
    group_by(cyl) %>% 
    summarise_at(c("mpg", "disp", "hp"), 
                 c(Mean = "mean", SD = "sd"))

mtcars_tbl %>% 
    group_by(cyl) %>% 
    summarise_at(c("mpg", "disp", "hp"), 
                 funs(Mean = mean, SD = sd))


# A tibble: 3 x 7
#    cyl mpg_Mean disp_Mean   hp_Mean   mpg_SD  disp_SD    hp_SD
#  <dbl>    <dbl>     <dbl>     <dbl>    <dbl>    <dbl>    <dbl>
#1     4 26.66364  105.1364  82.63636 4.509828 26.87159 20.93453
#2     6 19.74286  183.3143 122.28571 1.453567 41.56246 24.26049
#3     8 15.10000  353.1000 209.21429 2.560048 67.77132 50.97689

要有条件地选择列，请使用summarise_if：

就拿mean被所有列的numeric按分组cyl：

mtcars_tbl %>% 
    group_by(cyl) %>% 
    summarise_if(is.numeric, mean) 

# A tibble: 3 x 11
#    cyl      mpg     disp        hp     drat       wt     qsec
#  <dbl>    <dbl>    <dbl>     <dbl>    <dbl>    <dbl>    <dbl>
#1     4 26.66364 105.1364  82.63636 4.070909 2.285727 19.13727
#2     6 19.74286 183.3143 122.28571 3.585714 3.117143 17.97714
#3     8 15.10000 353.1000 209.21429 3.229286 3.999214 16.77214
# ... with 4 more variables: vs <dbl>, am <dbl>, gear <dbl>,
#   carb <dbl>

但是，某些变量是离散的，mean其中的变量没有意义。

通过以下mean方式仅取连续变量cyl：

mtcars_tbl %>% 
    group_by(cyl) %>% 
    summarise_if(function(x) is.numeric(x) & n_distinct(x) > 6, mean)

# A tibble: 3 x 7
#    cyl      mpg     disp        hp     drat       wt     qsec
#  <dbl>    <dbl>    <dbl>     <dbl>    <dbl>    <dbl>    <dbl>
#1     4 26.66364 105.1364  82.63636 4.070909 2.285727 19.13727
#2     6 19.74286 183.3143 122.28571 3.585714 3.117143 17.97714
#3     8 15.10000 353.1000 209.21429 3.229286 3.999214 16.77214

Rdplyr的单表动词

示例

语法共性

过滤

安排

选择

变异

总结

通过...分组

全部放在一起

汇总多列

热门推荐

随机推荐