用R根据logFC和p值批量标注基因上下调的N种方法
- 2019 年 12 月 19 日
- 筆記
情景:假如有下面这些基因
           expr      logFC    p.value
gene1 2.4667984 -2.9302068 0.07878848
gene2 1.4482891 -2.9680565 0.04675735
gene3 0.2481085  0.1787332 0.01685758
gene4 0.4244537 -1.0029163 0.02281603
gene5 1.6186835 -1.8350010 0.07323936
gene6 3.3965326 -2.2189805 0.04056557
想达到下面这种效果: p.value<0.05的前提下 logFC>1标记为上调,logFC<-1的标记为下调
           expr      logFC    p.value regulation
gene1 2.4667984 -2.9302068 0.07878848       none
gene2 1.4482891 -2.9680565 0.04675735       down
gene3 0.2481085  0.1787332 0.01685758       none
gene4 0.4244537 -1.0029163 0.02281603       down
gene5 1.6186835 -1.8350010 0.07323936       none
gene6 3.3965326 -2.2189805 0.04056557       down
下面是用R实现的几种方式:
目标:筛选差异基因,标注上调下调
p.value小于0.05,且logFC绝对值大于1的为DEG
先建立模拟数据
# Simulate a differential-expression table for 100 genes. The seed is fixed
# so the example is reproducible.
set.seed(1445)
df <- data.frame(
  expr = runif(100, 0.01, 5),
  logFC = runif(100, -3, 3),
  p.value = runif(100, 0, 0.1)
)
rownames(df) <- paste0("gene", seq_len(nrow(df)))
head(df)

# Element-wise logical masks, one entry per gene. The comparisons are strict
# so that they implement the stated rule exactly:
# p.value < 0.05 together with logFC > 1 (up) or logFC < -1 (down).
test_p <- df$p.value < 0.05  # significant
test_up <- df$logFC > 1      # up-regulated
test_down <- df$logFC < -1   # down-regulated
第一种方法:逻辑判断转为数字1和0,然后赋值
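添加 regulation 列:三个逻辑值相加时 TRUE 记为 1、FALSE 记为 0,其中下调的再乘以 10。这样 2 只可能表示"显著且上调",11 只可能表示"显著且下调",其余取值(0、1、10)都不是差异基因。乘以 10 属个人喜好,但我觉得很有用。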
library(dplyr)

# Fold the three logical tests into one numeric code per gene: being
# significant contributes 1, being up-regulated contributes 1 and being
# down-regulated contributes 10. A code of 2 therefore means "significant and
# up", and 11 means "significant and down".
df <- mutate(
  df,
  regulation = test_p + test_up + 10 * test_down,
  method1 = ""
)
table(df$regulation)

# Translate the codes into labels; 0, 1 and 10 are all "not a DEG".
df$method1[df$regulation == 2] <- "up"
df$method1[df$regulation == 11] <- "down"
df$method1[df$regulation %in% c(0, 1, 10)] <- "none"
第二种方法:逻辑判断转为数字1和0,然后用ifelse
# Method 2: nested ifelse() on the numeric code. Codes other than 2 and 11
# are labelled "none" first; the remaining codes are split into "up" (2) and
# "down" (11).
df$method2 <- with(
  df,
  ifelse(
    regulation != 2 & regulation != 11,
    "none",
    ifelse(regulation == 2, "up", "down")
  )
)
head(df)
第三种方法:逻辑判断转为数字1和0,然后用查询表
# Method 3: a named character vector used as a lookup table. The names are
# the regulation codes (as strings) and the values are the labels.
lookup <- setNames(
  c("up", "down", "none", "none", "none"),
  c(2, 11, 0, 1, 10)
)

# Index the table by the code converted to a string.
regulation_key <- as.character(df$regulation)
df$method3 <- lookup[regulation_key]
head(df)
第四种方法:逻辑判断转为数字1和0,然后用dplyr包的case_when
# Method 4: dplyr::case_when() on the numeric code. Conditions are tried in
# order and the first one that is TRUE decides the label, so the final TRUE
# is an explicit catch-all for every code other than 2 and 11.
df$method4 <- case_when(
  df$regulation == 2 ~ "up",
  df$regulation == 11 ~ "down",
  TRUE ~ "none"
)
第五种方法:ifelse直接判断然后赋值
# Method 5: nested ifelse() directly on the logical masks, with no numeric
# code in between. The tests are written in negated form: a gene that is not
# "significant and up" is checked for "significant and down", and anything
# failing both is "none".
df$method5 <- ifelse(
  !(test_p & test_up),
  ifelse(!(test_p & test_down), "none", "down"),
  "up"
)
第六种方法:dplyr的case_when
# Method 6: dplyr::case_when() directly on the logical masks. The last
# condition is the complement of the first two taken together: the gene is
# not both significant and up- or down-regulated.
df$method6 <- case_when(
  test_up & test_p ~ "up",
  test_down & test_p ~ "down",
  !(test_p & (test_up | test_down)) ~ "none"
)
第七种方法:逻辑判断转为数字1和0,然后用函数和for循环来标记
先写函数
# Map one numeric regulation code to its label.
#
# x: a single code as stored in df$regulation
#    (2 = significant and up, 11 = significant and down).
# Returns "up", "down" or "none" as a length-one character vector. The label
# is returned as a value rather than print()ed, so calling this in a loop
# does not write one line per gene to the console.
my_regulation <- function(x) {
  if (x == 2) {
    "up"
  } else if (x == 11) {
    "down"
  } else {
    "none"
  }
}

# Loop: fill a preallocated character vector one gene at a time.
# seq_len() also copes with a zero-row data frame, where 1:nrow(df) would
# iterate over c(1, 0). for() advances i itself, so no manual increment.
method7 <- vector("character", nrow(df))
for (i in seq_len(nrow(df))) {
  method7[i] <- my_regulation(df$regulation[i])
}

# Assign the plain vector. Wrapping it in data.frame() would store a nested
# data frame in the column instead of a character column.
df$method7 <- method7
head(df)
第八种方法:直接用函数和for循环
先写关于df的函数
# Label one gene directly from its p.value and logFC.
#
# x:    a single row index into `data`.
# data: data frame with `p.value` and `logFC` columns. It defaults to the
#       global `df`, so existing calls of the form my_regulation2(i) keep
#       working unchanged.
# Returns "up" when p.value < 0.05 and logFC > 1, "down" when p.value < 0.05
# and logFC < -1, and "none" otherwise. The label is returned as a value
# rather than print()ed.
my_regulation2 <- function(x, data = df) {
  p <- data$p.value[x]
  fc <- data$logFC[x]
  # && is the scalar, short-circuiting operator intended for if() conditions.
  if (p < 0.05 && fc > 1) {
    "up"
  } else if (p < 0.05 && fc < -1) {
    "down"
  } else {
    "none"
  }
}

# Loop: fill a preallocated character vector one row at a time. for()
# advances i itself, so no manual increment is needed.
method8 <- vector("character", nrow(df))
for (i in seq_len(nrow(df))) {
  method8[i] <- my_regulation2(i)
}
df$method8 <- method8
tail(df)
最终结果
> head(df)
       expr      logFC    p.value regulation method1 method2 method3 method4 method5 method6 method7 method8
1 2.4667984 -2.9302068 0.07878848         10    none    none    none    none    none    none    none    none
2 1.4482891 -2.9680565 0.04675735         11    down    down    down    down    down    down    down    down
3 0.2481085  0.1787332 0.01685758          1    none    none    none    none    none    none    none    none
4 0.4244537 -1.0029163 0.02281603         11    down    down    down    down    down    down    down    down
5 1.6186835 -1.8350010 0.07323936         10    none    none    none    none    none    none    none    none
6 3.3965326 -2.2189805 0.04056557         11    down    down    down    down    down    down    down    down
检查是不是每个方法结果一样
# Compare each method column with the next one and print how many rows
# agree. Every printed count should equal nrow(df) (100 for the simulated
# data) if all eight methods produce the same labels.
for (i in 1:7) {
  mi <- paste0("method", i)
  mj <- paste0("method", i + 1)
  print(sum(df[, mi] == df[, mj]))
}
结果如下
[1] 100
[1] 100
[1] 100
[1] 100
[1] 100
[1] 100
[1] 100