R - Tallying stringed labels
I have a data set `df` with stringed labels and want to understand how often each label appears. The actual data is large, so I need a way to generalize the code I've come up with below. Is there a better way to tally the individual labels after I've split them into columns (step 2), and then join them to sum (step 4)? Any hints/help appreciated.
# Tally how often each "-"-separated label in `v1` appears across the rows of
# `df`, then plot the percentage of rows in which each label appears.
library(tidyverse)
library(dplyr)
library(ggplot2)
library(reshape2)
library(splitstackshape)

df <- data.frame(
  v = c("a", "a", "b", "b", "b"),
  v1 = c(
    "place1-place2-place3-place4-place5-place6-place7",
    "place2-place4-place5-place7-place8",
    "place1-place2-place4-place7-place8-place9",
    "place3-place4-place2-place1",
    "place5-place6"
  )
)

# > df
#   v                                               v1
# 1 a place1-place2-place3-place4-place5-place6-place7
# 2 a               place2-place4-place5-place7-place8
# 3 b        place1-place2-place4-place7-place8-place9
# 4 b                      place3-place4-place2-place1
# 5 b                                    place5-place6

# 1 - split stringed labels in v1 into new columns (v1_1, v1_2, ...)
df2 <- cSplit(df, "v1", sep = "-", direction = "wide")

# 2 - tally labels per new column
# The filter() on columns 2-5 removes the entries for rows that had fewer
# labels than the column index (presumably padded with NA by cSplit; a
# comparison against NA is not TRUE, so filter() drops those rows as well).
# NOTE(review): only v1_1..v1_5 are tallied, although the longest row above
# has seven labels, so labels in positions 6 and 7 are not counted.
c1_f <- df2 %>%
  group_by(v1_1) %>%
  rename(label = v1_1) %>%
  tally()
c2_f <- df2 %>%
  group_by(v1_2) %>%
  rename(label = v1_2) %>%
  tally() %>%
  filter(label != "")
c3_f <- df2 %>%
  group_by(v1_3) %>%
  rename(label = v1_3) %>%
  tally() %>%
  filter(label != "")
c4_f <- df2 %>%
  group_by(v1_4) %>%
  rename(label = v1_4) %>%
  tally() %>%
  filter(label != "")
c5_f <- df2 %>%
  group_by(v1_5) %>%
  rename(label = v1_5) %>%
  tally() %>%
  filter(label != "")

# 3 - count total number of rows
ctally <- df2 %>%
  summarise(count = n())

# 4 - join tallies by label, and plot in decreasing order
# Every tally has a count column named `n`; the successive joins on `label`
# leave five count columns (n, n.x, n.y, n.x.x, n.y.y) that are summed per
# label, ignoring NA where a label is absent from a column.
c1_f %>%
  full_join(c2_f, by = "label") %>%
  full_join(c3_f, by = "label") %>%
  full_join(c4_f, by = "label") %>%
  full_join(c5_f, by = "label") %>%
  rowwise() %>%
  mutate(sum = sum(n, n.x, n.y, n.x.x, n.y.y, na.rm = TRUE)) %>%
  select(label, sum) %>%
  mutate(pct = 100 * sum / ctally$count) %>%
  # arrange(desc(sum)) %>%
  ggplot(aes(x = reorder(label, (sum)), y = pct)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "label", y = "% of time label appears in row", fill = "") +
  ggtitle("labels associated rows") +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()
Here is a more general tidyverse way:
library(tidyverse)

# Percentage of rows in which each "-"-separated label of `v1` appears.
# `cnt` stores the number of rows of `df` before the labels are expanded, so
# it can be used as the denominator after unnesting to one row per label.
df %>%
  mutate(
    cnt = n(),
    # fixed = TRUE treats the separator as a literal string, not a regex
    v1 = strsplit(as.character(v1), "-", fixed = TRUE)
  ) %>%
  # name the list column explicitly; a bare unnest() is deprecated in tidyr
  unnest(cols = v1) %>%
  count(v1, cnt) %>%
  mutate(percentage = 100 * n / cnt) %>%
  ggplot(aes(x = reorder(v1, percentage), y = percentage)) +
  geom_col() +
  coord_flip()
Comments
Post a Comment