From 0022645d04de5d0fc9628b5f4fecf07a6efb9a48 Mon Sep 17 00:00:00 2001
From: Bhanu Priya <bhanupriya151@gmail.com>
Date: Mon, 15 Aug 2022 14:31:47 +0530
Subject: [PATCH] Add pipeline 2 - DGE analysis

---
 .../shodhka/test_20220812/data/SampleGroups.txt  |  9 +++++++++
 .../test_20220812/data/SampleInputFileLinks.txt  | 12 ++++++++++++
 vendor/shodhka/test_20220812/dge_analysis.r      | 16 ++++++++++++++++
 vendor/shodhka/test_20220812/run.copr.sh         | 13 +++++++++++++
 4 files changed, 50 insertions(+)
 create mode 100644 vendor/shodhka/test_20220812/data/SampleGroups.txt
 create mode 100644 vendor/shodhka/test_20220812/data/SampleInputFileLinks.txt
 create mode 100755 vendor/shodhka/test_20220812/dge_analysis.r
 create mode 100755 vendor/shodhka/test_20220812/run.copr.sh

diff --git a/vendor/shodhka/test_20220812/data/SampleGroups.txt b/vendor/shodhka/test_20220812/data/SampleGroups.txt
new file mode 100644
index 0000000..69f5d8a
--- /dev/null
+++ b/vendor/shodhka/test_20220812/data/SampleGroups.txt
@@ -0,0 +1,9 @@
+Sample	Condition
+SRR15322680	Metastatic
+SRR15322681	Metastatic
+SRR15322682	Metastatic
+SRR15322683	Primary
+SRR15322684	Primary
+SRR15322685	Primary
+
+
diff --git a/vendor/shodhka/test_20220812/data/SampleInputFileLinks.txt b/vendor/shodhka/test_20220812/data/SampleInputFileLinks.txt
new file mode 100644
index 0000000..a938b0b
--- /dev/null
+++ b/vendor/shodhka/test_20220812/data/SampleInputFileLinks.txt
@@ -0,0 +1,12 @@
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR153/080/SRR15322680/SRR15322680_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR153/080/SRR15322680/SRR15322680_2.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR153/081/SRR15322681/SRR15322681_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR153/081/SRR15322681/SRR15322681_2.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR153/082/SRR15322682/SRR15322682_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR153/082/SRR15322682/SRR15322682_2.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR153/083/SRR15322683/SRR15322683_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR153/083/SRR15322683/SRR15322683_2.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR153/084/SRR15322684/SRR15322684_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR153/084/SRR15322684/SRR15322684_2.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR153/085/SRR15322685/SRR15322685_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR153/085/SRR15322685/SRR15322685_2.fastq.gz
diff --git a/vendor/shodhka/test_20220812/dge_analysis.r b/vendor/shodhka/test_20220812/dge_analysis.r
new file mode 100755
index 0000000..a02f605
--- /dev/null
+++ b/vendor/shodhka/test_20220812/dge_analysis.r
@@ -0,0 +1,69 @@
+#!/usr/bin/env Rscript
+
+# Differential gene expression (DGE) analysis with edgeR's quasi-likelihood
+# GLM workflow.
+#
+# Inputs, relative to the working directory:
+#   count_matrix.txt  tab-delimited raw counts with a header row; the
+#                     "gene_id" column becomes the row names and every other
+#                     column is one sample.
+#   SampleGroups.txt  tab-delimited table whose first column holds sample
+#                     names and which has a "Condition" column; looked up in
+#                     the working directory first, then under data/.
+# Outputs:
+#   edgeR_diff_genes.csv  test results for every retained gene.
+#   edgeR_normcounts.csv  log2-CPM of the retained genes, computed after
+#                         normalisation factors have been estimated.
+
+library(edgeR)
+
+raw_counts <- read.delim("count_matrix.txt", row.names = "gene_id")
+
+# run.copr.sh starts this script from the directory that contains data/, so
+# fall back to data/SampleGroups.txt when no copy sits next to the counts.
+groups_path <- "SampleGroups.txt"
+if (!file.exists(groups_path)) {
+  groups_path <- file.path("data", "SampleGroups.txt")
+}
+sample_groups <- read.table(
+  groups_path,
+  header = TRUE, sep = "\t", row.names = 1
+)
+
+# Match conditions to count columns by sample name where possible; relying on
+# row order alone would silently mislabel libraries if the two files differ.
+if (all(colnames(raw_counts) %in% rownames(sample_groups))) {
+  sample_groups <- sample_groups[colnames(raw_counts), , drop = FALSE]
+} else {
+  warning(
+    "count_matrix.txt columns do not all appear in ", groups_path,
+    "; assuming both files list the samples in the same order",
+    call. = FALSE
+  )
+  stopifnot(ncol(raw_counts) == nrow(sample_groups))
+}
+
+# Factor levels sort alphabetically; coefficient 2 of the design is the second
+# level relative to the first.
+condition <- factor(sample_groups$Condition)
+design <- model.matrix(~condition)
+
+dge <- DGEList(raw_counts, group = condition)
+
+# Drop weakly expressed genes and recompute library sizes before normalising.
+keep <- filterByExpr(dge)
+dge <- dge[keep, , keep.lib.sizes = FALSE]
+dge <- calcNormFactors(dge)
+
+# Computed after calcNormFactors() so the exported values use the effective
+# (normalised) library sizes; before that step every factor is still 1.
+logcpm <- cpm(dge, log = TRUE)
+
+# Use the same design for dispersion estimation and for the model fit.
+dge <- estimateDisp(dge, design)
+fit <- glmQLFit(dge, design)
+qlf <- glmQLFTest(fit, coef = 2)
+diff_results <- topTags(qlf, n = Inf)
+
+write.csv(diff_results$table, file = "edgeR_diff_genes.csv")
+write.csv(as.data.frame(logcpm), file = "edgeR_normcounts.csv")
diff --git a/vendor/shodhka/test_20220812/run.copr.sh b/vendor/shodhka/test_20220812/run.copr.sh
new file mode 100755
index 0000000..1e87706
--- /dev/null
+++ b/vendor/shodhka/test_20220812/run.copr.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+#
+# RNA-seq test pipeline: download the paired-end FASTQ files, align each sample
+# with HISAT2, count reads per gene with htseq-count, then run the edgeR script.
+#
+# Expects genome.fa and hs.gff in the working directory, next to data/.
+
+set -euo pipefail
+
+links_file=./data/SampleInputFileLinks.txt
+groups_file=./data/SampleGroups.txt
+index=hg38_hisat
+
+# Fetch every listed FASTQ file, skipping the ones that are already present.
+while IFS= read -r url || [[ -n "$url" ]]; do
+    [[ -n "$url" ]] || continue
+    # --no-clobber, do not download if file already exists.
+    [[ -e "${url##*/}" ]] || wget -nc "$url"
+done < "$links_file"
+
+hisat2-build -p 8 genome.fa "$index"
+
+# Align each sample named in the groups table (header row skipped). The reads
+# are used as downloaded; this pipeline has no trimming step.
+sam_files=()
+header=gene_id
+while IFS=$'\t' read -r sample _ || [[ -n "$sample" ]]; do
+    [[ -n "$sample" ]] || continue
+    hisat2 -x "$index" \
+        -1 "${sample}_1.fastq.gz" -2 "${sample}_2.fastq.gz" \
+        -S "${sample}.sam"
+    sam_files+=("${sample}.sam")
+    header+=$'\t'"$sample"
+done < <(tail -n +2 "$groups_file")
+
+if (( ${#sam_files[@]} == 0 )); then
+    echo "No samples found in $groups_file" >&2
+    exit 1
+fi
+
+# dge_analysis.r reads a header row with a gene_id column, and htseq-count's
+# trailing "__" summary rows are not genes, so add the former and drop the
+# latter.
+{
+    printf '%s\n' "$header"
+    htseq-count -r name "${sam_files[@]}" hs.gff | grep -v '^__'
+} > count_matrix.txt
+
+Rscript dge_analysis.r
-- 
GitLab