小言_互联网的博客

生信&数据挖掘——人工神经网络篇(3)差异分析

319人阅读  评论(0)

在这篇文章中,我将讨论如何使用基因表达差异分析(DEA)来进行差异分析。DEA是一种有效的方法,可以从大量基因表达数据中检测和预测差异。我将介绍DEA的基本原理,并讨论如何使用它来进行差异分析。我还将介绍一些常用的DEA技术,以及它们如何用于差异分析。最后,我将介绍一些实际应用,以及如何使用DEA来解决实际问题。

先看看最终结果和名词解释

ID:ID是一个唯一的标识符,用于标识每个样本。

logFC:logFC是对数变化率的缩写,它表示两个样本之间的变化率,以对数形式表示。 (大于1上调,小于1下调)也可以是2(看今天需求)

AveExpr:AveExpr是平均表达量的缩写,它表示每个样本的平均表达量。

P.Value:P.Value是P值的缩写,它表示两个样本之间的差异是否显著。

P < 0.05:表示统计检验的结果显示,观察到的结果是显著的,概率值小于0.05,表明结果是有统计学意义的。

P 0.01:表示统计检验的结果显示,观察到的结果是极其显著的,概率值小于0.01,表明结果是有极强的统计学意义的。

P ≤ 0.001:表示统计检验的结果显示,观察到的结果是极其显著的,概率值小于或等于0.001,表明结果是有极强的统计学意义的。

P 0.0001:表示统计检验的结果显示,观察到的结果是极其显著的,概率值小于0.0001,表明结果是有极强的统计学意义的。

t:检验值,用于检验两组样本之间的差异是否具有统计学意义。

adj.P.Val:adj.P.Val是调整后的P值的缩写,它表示两个样本之间的差异是否显著,考虑了多重比较的影响。(一般看p值不看调整后的P值)

差异基因火山图

一般我们用于显示基因表达水平之间的差异。它通过将基因表达水平的差异映射到一个热图上,以便更容易地比较和观察不同样本之间的差异。热图中的右侧一行代表一个基因,颜色代表基因表达水平的差异,红色表示表达水平较高(高表达),蓝色表示表达水平较低(低表达)。

GSE65801和GSE54129数据先合并一下

输入文件:注释好的基因表达数据,只需要将txt放在和脚本同一文件夹里面


  
  1. #引用包
  2. library (limma )
  3. library (sva )
  4. outFile = "merge.txt" #输出文件
  5. setwd ( "C:\\05.sva" ) #设置工作目录
  6. #获取目录下所有".txt"结尾的文件
  7. files =dir ( )
  8. files =grep ( "txt$" , files , value = T )
  9. geneList = list ( )
  10. #读取所有txt文件中的基因信息,保存到geneList
  11. for (file in files ) {
  12. if (file ==outFile ) { next }
  13. rt =read.table (file , header = T , sep = "\t" , check.names = F ) #读取输入文件
  14. geneNames =as.vector (rt [ , 1 ] ) #提取基因名称
  15. uniqGene =unique (geneNames ) #基因取unique
  16. header =unlist (strsplit (file , "\\.|\\-" ) )
  17. geneList [[header [ 1 ] ] ] =uniqGene
  18. }
  19. #获取交集基因
  20. interGenes =Reduce (intersect , geneList )
  21. #数据合并
  22. allTab =data.frame ( )
  23. batchType = c ( )
  24. for (i in 1 : length (files ) ) {
  25. inputFile =files [i ]
  26. header =unlist (strsplit (inputFile , "\\.|\\-" ) )
  27. #读取输入文件,并对输入文件进行整理
  28. rt =read.table (inputFile , header = T , sep = "\t" , check.names = F )
  29. rt =as.matrix (rt )
  30. rownames (rt ) =rt [ , 1 ]
  31. exp =rt [ , 2 :ncol (rt ) ]
  32. dimnames = list (rownames ( exp ) ,colnames ( exp ) )
  33. data =matrix ( as.numeric (as.matrix ( exp ) ) ,nrow =nrow ( exp ) , dimnames = dimnames )
  34. rt =avereps (data )
  35. #对数值大的数据取log2
  36. qx = as.numeric (quantile (rt , c ( 0 , 0.25 , 0.5 , 0.75 , 0.99 , 1.0 ) , na.rm = T ) )
  37. LogC = ( (qx [ 5 ] > 100 ) || ( (qx [ 6 ] -qx [ 1 ] ) > 50 && qx [ 2 ] > 0 ) )
  38. if (LogC ) {
  39. rt [rt < 0 ] = 0
  40. rt =log2 (rt + 1 ) }
  41. rt =normalizeBetweenArrays (rt )
  42. #数据合并
  43. if (i == 1 ) {
  44. allTab =rt [interGenes , ]
  45. } else {
  46. allTab =cbind (allTab , rt [interGenes , ] )
  47. }
  48. batchType = c (batchType , rep (i ,ncol (rt ) ) )
  49. }
  50. #对数据进行矫正,输出矫正后的结果
  51. outTab =ComBat (allTab , batchType , par.prior = TRUE )
  52. outTab =rbind (geneNames =colnames (outTab ) , outTab )
  53. write.table (outTab , file = "merge.txt" , sep = "\t" , quote = F , col.names = F )

结果生成merge.txt

将merge.txt和他们对照组和实验组的信息放在同一个文件夹

脚本如下:


  
  1. #if (!requireNamespace("BiocManager", quietly = TRUE))
  2. # install.packages("BiocManager")
  3. #BiocManager::install("limma")
  4. #install.packages("pheatmap")
  5. #引用包
  6. library (limma )
  7. library (pheatmap )
  8. inputFile = "merge.txt" #输入文件
  9. logFCfilter = 2 #logFC过滤阈值
  10. adj.P.Val.Filter = 0.05 #矫正后p值阈值
  11. setwd ( "C:\\biowolf\\Diagnostic\\06.diff" ) #设置工作目录
  12. #读取输入文件,并对输入文件整理
  13. rt =read.table (inputFile , header = T , sep = "\t" , check.names = F )
  14. rt =as.matrix (rt )
  15. rownames (rt ) =rt [ , 1 ]
  16. exp =rt [ , 2 :ncol (rt ) ]
  17. dimnames = list (rownames ( exp ) ,colnames ( exp ) )
  18. data =matrix ( as.numeric (as.matrix ( exp ) ) ,nrow =nrow ( exp ) , dimnames = dimnames )
  19. data =avereps (data )
  20. data =data [rowMeans (data ) > 0 , ]
  21. #读取目录下所有"s1.txt"结尾的文件
  22. sampleName1 = c ( )
  23. files =dir ( )
  24. files =grep ( "s1.txt$" , files , value = T )
  25. for (file in files ) {
  26. rt =read.table (file , header = F , sep = "\t" , check.names = F ) #读取输入文件
  27. geneNames =as.vector (rt [ , 1 ] ) #提取基因名称
  28. uniqGene =unique (geneNames ) #基因取unique
  29. sampleName1 = c (sampleName1 , uniqGene )
  30. }
  31. #读取目录下所有"s2.txt"结尾的文件
  32. sampleName2 = c ( )
  33. files =dir ( )
  34. files =grep ( "s2.txt$" , files , value = T )
  35. for (file in files ) {
  36. rt =read.table (file , header = F , sep = "\t" , check.names = F ) #读取输入文件
  37. geneNames =as.vector (rt [ , 1 ] ) #提取基因名称
  38. uniqGene =unique (geneNames ) #基因取unique
  39. sampleName2 = c (sampleName2 , uniqGene )
  40. }
  41. #提取实验组和对照组的数据
  42. conData =data [ ,sampleName1 ]
  43. treatData =data [ ,sampleName2 ]
  44. data =cbind (conData ,treatData )
  45. conNum =ncol (conData )
  46. treatNum =ncol (treatData )
  47. #差异分析
  48. Type = c ( rep ( "con" ,conNum ) , rep ( "treat" ,treatNum ) )
  49. design <- model.matrix ( ~ 0 +factor (Type ) )
  50. colnames (design ) <- c ( "con" , "treat" )
  51. fit <- lmFit (data ,design )
  52. cont.matrix <-makeContrasts (treat -con ,levels =design )
  53. fit2 <- contrasts.fit (fit , cont.matrix )
  54. fit2 <- eBayes (fit2 )
  55. allDiff =topTable (fit2 ,adjust = 'fdr' ,number = 200000 )
  56. allDiffOut =rbind (id =colnames (allDiff ) ,allDiff )
  57. write.table (allDiffOut , file = "all.txt" , sep = "\t" , quote = F , col.names = F )
  58. #输出矫正后的表达量
  59. outData =rbind (id =paste0 (colnames (data ) , "_" ,Type ) ,data )
  60. write.table (outData , file = "normalize.txt" , sep = "\t" , quote = F , col.names = F )
  61. #输出差异结果
  62. diffSig =allDiff [with (allDiff , ( abs (logFC ) >logFCfilter & adj.P.Val < adj.P.Val.Filter ) ) , ]
  63. diffSigOut =rbind (id =colnames (diffSig ) ,diffSig )
  64. write.table (diffSigOut , file = "diff.txt" , sep = "\t" , quote = F , col.names = F )
  65. #输出差异基因表达量
  66. diffGeneExp =data [row.names (diffSig ) , ]
  67. diffGeneExpOut =rbind (id =paste0 (colnames (diffGeneExp ) , "_" ,Type ) , diffGeneExp )
  68. write.table (diffGeneExpOut , file = "diffGeneExp.txt" , sep = "\t" , quote = F , col.names = F )
  69. #绘制差异基因热图
  70. geneNum = 50
  71. diffSig =diffSig [order ( as.numeric (as.vector (diffSig $logFC ) ) ) , ]
  72. diffGeneName =as.vector (rownames (diffSig ) )
  73. diffLength = length (diffGeneName )
  74. hmGene = c ( )
  75. if (diffLength > ( 2 *geneNum ) ) {
  76. hmGene =diffGeneName [ c ( 1 :geneNum , (diffLength -geneNum + 1 ) :diffLength ) ]
  77. } else {
  78. hmGene =diffGeneName
  79. }
  80. hmExp =data [hmGene , ]
  81. Type = c ( rep ( "Con" ,conNum ) , rep ( "Treat" ,treatNum ) )
  82. names (Type ) =colnames (data )
  83. Type =as.data.frame (Type )
  84. pdf (file = "heatmap.pdf" , width = 10 , height = 8 )
  85. pheatmap (hmExp ,
  86. annotation =Type ,
  87. color = colorRampPalette ( c ( "blue" , "white" , "red" ) ) ( 50 ) ,
  88. cluster_cols = F ,
  89. show_colnames = F ,
  90. scale = "row" ,
  91. fontsize = 8 ,
  92. fontsize_row = 7 ,
  93. fontsize_col = 8 )
  94. dev.off ( )

最终输出热图和差异表达的值

绘制火山图

输入文件 all.txt 上一步差异基因图得到的

脚本如下:


  
  1. #install.packages("ggplot2")
  2. library (ggplot2 ) #引用包
  3. logFCfilter = 2 #logFC过滤条件
  4. adj.P.Val.Filter = 0.05 #矫正后的p值过滤条件
  5. inputFile = "all.txt" #输入文件
  6. setwd ( "C:\\biowolf\\neuralDiagnostic\\07.volcano" ) #设置工作目录
  7. #读取输入文件
  8. rt =read.table (inputFile , header = T , sep = "\t" , check.names = F )
  9. #定义显著性
  10. Sig =ifelse ( (rt $adj.P.Val <adj.P.Val.Filter ) & ( abs (rt $logFC ) >logFCfilter ) , ifelse (rt $logFC >logFCfilter , "Up" , "Down" ) , "Not" )
  11. rt =cbind (rt , Sig =Sig )
  12. #绘制火山图
  13. p =ggplot (rt , aes (logFC , -log10 (adj.P.Val ) ) ) +
  14. geom_point (aes (col =Sig ) ) +
  15. scale_color_manual (values = c ( "green" , "black" , "red" ) ) +
  16. xlim ( - 5 , 5 ) +
  17. labs (title = " " ) +
  18. geom_vline (xintercept = c ( -logFCfilter ,logFCfilter ) , col = "blue" , cex = 1 , linetype = 2 ) +
  19. geom_hline (yintercept = -log10 (adj.P.Val.Filter ) , col = "blue" , cex = 1 , linetype = 2 ) +
  20. theme (plot.title =element_text (size = 16 , hjust = 0.5 , face = "bold" ) )
  21. p =p +theme_bw ( )
  22. #输出火山图
  23. pdf (file = "volcano.pdf" , width = 6 , height = 5.1 )
  24. print (p )
  25. dev.off ( )

 

绿色是下调,红色是上调的基因


转载:https://blog.csdn.net/qq_62932195/article/details/128743699
查看评论
* 以上用户言论只代表其个人观点,不代表本网站的观点或立场