R - How can I find the percentage of similar strings within and across various columns?
I have the data below:
# Example data: three factor columns of strings, 8 rows each.
# Empty strings ("") mark missing cells in v2 and v3.
# Integer codes need the `L` suffix and the row-name placeholder must be `NA`;
# `levels` / `names` are the explicit forms of dput()'s `.Label` / `.Names`.
df <- structure(
  list(
    v1 = structure(
      c(5L, 1L, 7L, 3L, 2L, 4L, 6L, 6L),
      levels = c(
        "cpsiaaaiaavnalhgr", "dlnycfsgmsdhr", "fpehelivdpqr",
        "iadpdavkpddwdedapsk", "lwadhgvqacfgr", "wgeagaeyvvestgvfttmek",
        "yyvtiidapghr"
      ),
      class = "factor"
    ),
    v2 = structure(
      c(5L, 2L, 7L, 3L, 4L, 6L, 1L, 1L),
      levels = c(
        "", "cpsiaaaiaavnalhgr", "gcitiigggdtatccak",
        "hvgpgvlsmanagpntngsqffictik", "llelgpkpevaqqtr",
        "mvccsawsedhpicnlftcgfdr", "yyvtiidapghr"
      ),
      class = "factor"
    ),
    v3 = structure(
      c(4L, 3L, 2L, 4L, 3L, 1L, 1L, 1L),
      levels = c("", "avcmlsnttaiaeawar", "dlnycfsgmsdhr", "fpehelivdpqr"),
      class = "factor"
    )
  ),
  names = c("v1", "v2", "v3"),
  class = "data.frame",
  row.names = c(NA, -8L)
)
I want to know how many of the strings are shared between the columns.
For example:
cpsiaaaiaavnalhgr 1,2
yyvtiidapghr 1,2
wgeagaeyvvestgvfttmek 1,1
fpehelivdpqr 1,3
dlnycfsgmsdhr 1,3
dlnycfsgmsdhr 1,3
This means the first string, cpsiaaaiaavnalhgr, is repeated in the first column and the second column; yyvtiidapghr is repeated in the first column and the second column; and so on.
I also want the percentage shared. The first column has 8 rows; among these 8 rows, it shares 2 rows with column 2, so the share is 2/8*100 = 25%. The first and third columns share 3 rows, so 3/8*100 = 37.5%. Column 2 and column 3 share 0%, and so on.
I feel that the number of common strings after removing duplicates from each column (i.e. considering only unique strings when matching) is more useful. So I'm updating my solution, and I'm also adding some code for a Venn diagram.
library(dplyr)
library(tidyr)
library(gplots)

# Reshape to long format: one row per (column, string) pair.
# Empty cells are dropped and duplicate pairs removed, so every count below
# refers to unique strings within a column.
df_reshaped <- df %>%
  gather(column, string) %>%
  filter(string != "") %>%
  distinct()

# For each string, list the columns in which it appears.
df_result1 <- df_reshaped %>%
  group_by(string) %>%
  summarise(columns = paste(unique(column), collapse = ","))

df_result1

# # tibble: 12 x 2
#    string                      columns
#    <chr>                       <chr>
#  1 avcmlsnttaiaeawar           v3
#  2 cpsiaaaiaavnalhgr           v1,v2
#  3 dlnycfsgmsdhr               v1,v3
#  4 fpehelivdpqr                v1,v3
#  5 gcitiigggdtatccak           v2
#  6 hvgpgvlsmanagpntngsqffictik v2
#  7 iadpdavkpddwdedapsk         v1
#  8 llelgpkpevaqqtr             v2
#  9 lwadhgvqacfgr               v1
# 10 mvccsawsedhpicnlftcgfdr     v2
# 11 wgeagaeyvvestgvfttmek       v1
# 12 yyvtiidapghr                v1,v2

# Number of unique strings of column `v2` that also occur in column `v1`.
# Reads the long-format table `df_reshaped` defined above.
f1 <- function(v1, v2) {
  x1 <- (df_reshaped %>% filter(column == v1))$string
  x2 <- (df_reshaped %>% filter(column == v2))$string
  length(x2[x2 %in% x1])
}
# Vectorize so f1 can be applied to whole columns of names inside mutate().
f1 <- Vectorize(f1)

# Number of unique, non-empty strings in column `v` (rows of `df_reshaped`).
f2 <- function(v) {
  df_reshaped %>%
    filter(column == v) %>%
    nrow()
}
f2 <- Vectorize(f2)

# Pairwise overlap of columns: for every ordered pair (Var1, Var2), the number
# of common strings and that number as a proportion of Var1's unique strings.
# expand.grid() names its output columns Var1 and Var2.
expand.grid(unique(df_reshaped$column), unique(df_reshaped$column)) %>%
  filter(Var1 != Var2) %>%
  mutate(
    numshared = f1(Var1, Var2),
    numrows = f2(Var1),
    prc = numshared / numrows
  ) %>%
  arrange(Var1, Var2)

#   Var1 Var2 numshared numrows       prc
# 1   v1   v2         2       7 0.2857143
# 2   v1   v3         2       7 0.2857143
# 3   v2   v1         2       6 0.3333333
# 4   v2   v3         0       6 0.0000000
# 5   v3   v1         2       3 0.6666667
# 6   v3   v2         0       3 0.0000000

# Reshape to one logical membership column per original column
# (TRUE where the string occurs in it) and draw the Venn diagram.
df_reshaped %>%
  mutate(exist = TRUE) %>%
  spread(column, exist, fill = FALSE) %>%
  select(-string) %>%
  venn()
Obviously, the sum of the numbers shown in the diagram should equal the number of unique strings in the table df_result1, i.e. 12 in our case.
Comments
Post a Comment