如何在R中截断字符串向量中指定字符之后的内容?
数据分析中最棘手的工作之一是清洗脏数据。在大多数情况下,原始数据都不够干净,其中一种常见情形是字符串向量的各个元素在某个特定字符之后带有不需要的内容。要去掉该字符及其之后的部分,我们可以先用stringr包中的str_split函数按该字符拆分每个字符串,再用sapply函数取出每个拆分结果的第一个元素,如以下示例所示。
library(stringr)
示例
x1<-sample(c("India#21","China#42","UK#14","Japan#22","United States#25","Egpyt#27","Sudan#36"),100,replace=TRUE)
x1
输出结果
[1] "China#42" "Egpyt#27" "India#21" [4] "China#42" "United States#25" "Japan#22" [7] "Egpyt#27" "Japan#22" "Egpyt#27" [10] "India#21" "India#21" "India#21" [13] "China#42" "Japan#22" "UK#14" [16] "India#21" "India#21" "China#42" [19] "United States#25" "Japan#22" "Sudan#36" [22] "China#42" "United States#25" "United States#25" [25] "Sudan#36" "India#21" "India#21" [28] "Sudan#36" "Egpyt#27" "Japan#22" [31] "UK#14" "UK#14" "UK#14" [34] "United States#25" "United States#25" "UK#14" [37] "Egpyt#27" "Egpyt#27" "India#21" [40] "India#21" "UK#14" "China#42" [43] "UK#14" "United States#25" "India#21" [46] "Egpyt#27" "Japan#22" "India#21" [49] "Japan#22" "UK#14" "Egpyt#27" [52] "Sudan#36" "Japan#22" "United States#25" [55] "Japan#22" "UK#14" "Sudan#36" [58] "Sudan#36" "Egpyt#27" "Sudan#36" [61] "India#21" "China#42" "Egpyt#27" [64] "Sudan#36" "Sudan#36" "Egpyt#27" [67] "China#42" "Japan#22" "Egpyt#27" [70] "China#42" "India#21" "United States#25" [73] "Egpyt#27" "United States#25" "India#21" [76] "Sudan#36" "Sudan#36" "India#21" [79] "Japan#22" "India#21" "Sudan#36" [82] "United States#25" "China#42" "China#42" [85] "Japan#22" "Egpyt#27" "China#42" [88] "Sudan#36" "United States#25" "United States#25" [91] "India#21" "Japan#22" "United States#25" [94] "China#42" "Japan#22" "Japan#22" [97] "Japan#22" "UK#14" "China#42" [100] "China#42"
示例
sapply(str_split(x1,"#",),'[',1)
输出结果
[1] "China" "Egpyt" "India" "China" [5] "United States" "Japan" "Egpyt" "Japan" [9] "Egpyt" "India" "India" "India" [13] "China" "Japan" "UK" "India" [17] "India" "China" "United States" "Japan" [21] "Sudan" "China" "United States" "United States" [25] "Sudan" "India" "India" "Sudan" [29] "Egpyt" "Japan" "UK" "UK" [33] "UK" "United States" "United States" "UK" [37] "Egpyt" "Egpyt" "India" "India" [41] "UK" "China" "UK" "United States" [45] "India" "Egpyt" "Japan" "India" [49] "Japan" "UK" "Egpyt" "Sudan" [53] "Japan" "United States" "Japan" "UK" [57] "Sudan" "Sudan" "Egpyt" "Sudan" [61] "India" "China" "Egpyt" "Sudan" [65] "Sudan" "Egpyt" "China" "Japan" [69] "Egpyt" "China" "India" "United States" [73] "Egpyt" "United States" "India" "Sudan" [77] "Sudan" "India" "Japan" "India" [81] "Sudan" "United States" "China" "China" [85] "Japan" "Egpyt" "China" "Sudan" [89] "United States" "United States" "India" "Japan" [93] "United States" "China" "Japan" "Japan" [97] "Japan" "UK" "China" "China"
示例
x2<-sample(c("rahul@gmail.com","krishna@gmail.com","surbhi@gmail.com","shobhit@gmail.com","ujjal@gmail.com","nizam@gmail.com","supriya@gmail.com","rushi@gmail.com"),100,replace=TRUE)
x2
输出结果
[1] "supriya@gmail.com" "rahul@gmail.com" "krishna@gmail.com" [4] "shobhit@gmail.com" "nizam@gmail.com" "shobhit@gmail.com" [7] "nizam@gmail.com" "surbhi@gmail.com" "rushi@gmail.com" [10] "rushi@gmail.com" "rushi@gmail.com" "rushi@gmail.com" [13] "krishna@gmail.com" "shobhit@gmail.com" "ujjal@gmail.com" [16] "nizam@gmail.com" "supriya@gmail.com" "ujjal@gmail.com" [19] "ujjal@gmail.com" "supriya@gmail.com" "rahul@gmail.com" [22] "shobhit@gmail.com" "krishna@gmail.com" "nizam@gmail.com" [25] "shobhit@gmail.com" "rushi@gmail.com" "rushi@gmail.com" [28] "ujjal@gmail.com" "ujjal@gmail.com" "ujjal@gmail.com" [31] "supriya@gmail.com" "rahul@gmail.com" "ujjal@gmail.com" [34] "shobhit@gmail.com" "krishna@gmail.com" "krishna@gmail.com" [37] "shobhit@gmail.com" "surbhi@gmail.com" "nizam@gmail.com" [40] "surbhi@gmail.com" "ujjal@gmail.com" "shobhit@gmail.com" [43] "ujjal@gmail.com" "krishna@gmail.com" "supriya@gmail.com" [46] "ujjal@gmail.com" "supriya@gmail.com" "ujjal@gmail.com" [49] "ujjal@gmail.com" "rushi@gmail.com" "krishna@gmail.com" [52] "rahul@gmail.com" "nizam@gmail.com" "rushi@gmail.com" [55] "nizam@gmail.com" "surbhi@gmail.com" "rahul@gmail.com" [58] "supriya@gmail.com" "nizam@gmail.com" "shobhit@gmail.com" [61] "rahul@gmail.com" "shobhit@gmail.com" "supriya@gmail.com" [64] "shobhit@gmail.com" "rahul@gmail.com" "shobhit@gmail.com" [67] "ujjal@gmail.com" "supriya@gmail.com" "nizam@gmail.com" [70] "surbhi@gmail.com" "rushi@gmail.com" "rushi@gmail.com" [73] "rushi@gmail.com" "supriya@gmail.com" "surbhi@gmail.com" [76] "nizam@gmail.com" "rushi@gmail.com" "supriya@gmail.com" [79] "nizam@gmail.com" "rahul@gmail.com" "rahul@gmail.com" [82] "surbhi@gmail.com" "rushi@gmail.com" "ujjal@gmail.com" [85] "rahul@gmail.com" "rushi@gmail.com" "rushi@gmail.com" [88] "ujjal@gmail.com" "ujjal@gmail.com" "nizam@gmail.com" [91] "supriya@gmail.com" "surbhi@gmail.com" "nizam@gmail.com" [94] "surbhi@gmail.com" "supriya@gmail.com" "shobhit@gmail.com" [97] "supriya@gmail.com" "rahul@gmail.com" "nizam@gmail.com" [100] "rushi@gmail.com"
示例
sapply(str_split(x2,"@",),'[',1)
输出结果
[1] "supriya" "rahul" "krishna" "shobhit" "nizam" "shobhit" "nizam" [8] "surbhi" "rushi" "rushi" "rushi" "rushi" "krishna" "shobhit" [15] "ujjal" "nizam" "supriya" "ujjal" "ujjal" "supriya" "rahul" [22] "shobhit" "krishna" "nizam" "shobhit" "rushi" "rushi" "ujjal" [29] "ujjal" "ujjal" "supriya" "rahul" "ujjal" "shobhit" "krishna" [36] "krishna" "shobhit" "surbhi" "nizam" "surbhi" "ujjal" "shobhit" [43] "ujjal" "krishna" "supriya" "ujjal" "supriya" "ujjal" "ujjal" [50] "rushi" "krishna" "rahul" "nizam" "rushi" "nizam" "surbhi" [57] "rahul" "supriya" "nizam" "shobhit" "rahul" "shobhit" "supriya" [64] "shobhit" "rahul" "shobhit" "ujjal" "supriya" "nizam" "surbhi" [71] "rushi" "rushi" "rushi" "supriya" "surbhi" "nizam" "rushi" [78] "supriya" "nizam" "rahul" "rahul" "surbhi" "rushi" "ujjal" [85] "rahul" "rushi" "rushi" "ujjal" "ujjal" "nizam" "supriya" [92] "surbhi" "nizam" "surbhi" "supriya" "shobhit" "supriya" "rahul" [99] "nizam" "rushi"