added pipelines.

ccb99c2a · dilawar · 533e6a52 · ccb99c2a · ccb99c2a · ccb99c2a
Commit ccb99c2a authored 2 years ago by dilawar
--- a/pipeline_dge/README.md
+++ b/pipeline_dge/README.md
+To run this pipeline using `bitia`, Simply execute the following command in this
+directory
+```
+$ bitia run <path_to_the analysis_directory>
+```
+**File and directory details**
+- *run.bitia.sh* : Pipeline
+- *input_file_links.txt* : Contains links to the sequencing data, which needs to be downloaded
+- *sample_groups.txt* : Contains details of the control and experimental groups
+- *reference_file_links* : Contains link to the reference genome, which needs to be downloaded
+- *htseq-merge_all.r* : Script to merge count files
+- *count_analysis.r* : Script for differential gene expression analysis
+**R libraries required**
+- *edgeR*
--- a/pipeline_dge/count_analysis.r
+++ b/pipeline_dge/count_analysis.r
+library(edgeR)
+RawCounts <- read.delim("count_matrix_cleaned.txt", row.names = "ENSEMBL_GeneID")
+group <- read.table("sample_groups.txt", header=TRUE, sep="\t", row.names=1)
+dgecomplete <- DGEList(RawCounts, group = group$Condition)
+logcpm <- cpm(dgecomplete, log=TRUE)
+filtData <- filterByExpr(dgecomplete)
+dgecomplete <- dgecomplete[filtData, keep.lib.sizes=FALSE]
+dgecomplete <- calcNormFactors(dgecomplete)
+dgecomplete <- estimateDisp(y = dgecomplete)
+fit <- glmQLFit(y=dgecomplete)
+qlf <- glmQLFTest(fit, coef = 2)
+diff_results <- topTags(qlf, n=Inf)
+write.csv(diff_results, file="edgeR_diff_genes.csv")
+write.csv(as.data.frame(logcpm), file="edgeR_normcounts.csv")
+# Functional analysis
+library(org.Hs.eg.db)
+library(GO.db)
+row.names(qlf) <- keys(org.Hs.eg.db)[1:nrow(qlf)]
+annot <- select(org.Hs.eg.db, row.names(qlf), c("ENTREZID"))
+go <- goana(qlf, species="Hs")
+kegg <- kegga(qlf, species="Hs")
+write.csv(go, file="GO_all.csv")
+write.csv(kegg, file="kegg_all.csv")
--- a/pipeline_dge/expected_output/count_matrix.txt
+++ b/pipeline_dge/expected_output/count_matrix.txt
--- a/pipeline_dge/expected_output/count_matrix_cleaned.txt
+++ b/pipeline_dge/expected_output/count_matrix_cleaned.txt
--- a/pipeline_dge/expected_output/edgeR_diff_genes.csv
+++ b/pipeline_dge/expected_output/edgeR_diff_genes.csv
--- a/pipeline_dge/expected_output/edgeR_normcounts.csv
+++ b/pipeline_dge/expected_output/edgeR_normcounts.csv
--- a/pipeline_dge/htseq-merge_all.r
+++ b/pipeline_dge/htseq-merge_all.r
+## R script for combine HTSeq outputs into one read count matrix in R
+## Written by Ahmed Alhendi
+## Usage example:  Rscript htseq-combine_all.R <workdir> <Reacount_matrix_output_name>
+args <- commandArgs(TRUE)
+path <- as.character(args[1])
+myoutname <-as.character(args[2])
+print(paste0("############################"))
+##Read files names
+files <- list.files(path=path, pattern="*.txt")
+print(sprintf("## Files to be merged are: ##"))
+print(files)
+print(paste0("############################"))
+##Read files names
+files <- list.files(path=path, pattern="*.txt")
+print(sprintf("## Files to be merged are: ##"))
+print(files)
+print(paste0("############################"))
+# using perl to manpulate file names by trimming file extension
+labs <- paste("", gsub("\\.txt", "", files, perl=TRUE), sep="")
+##Load all files to list object, use paste to return the trimpping parts to file name
+print(sprintf("######### file read START ######### %s", format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
+cov <- list()
+for (i in labs) {
+filepath <- file.path(path,paste(i,".txt",sep=""))
+cov[[i]] <- read.table(filepath,sep = "\t", header=F, stringsAsFactors=FALSE)
+colnames(cov[[i]]) <- c("ENSEMBL_GeneID", i)
+}
+print(sprintf("######### file read END ######### %s", format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
+## construct one data frame from list of data.frames using reduce function
+print(sprintf("######### merge START ######### %s", 
+format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
+df <-Reduce(function(x,y) merge(x = x, y = y, by ="ENSEMBL_GeneID"), cov)
+print(sprintf("######### merge END ######### %s", format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
+print(sprintf("Exported merged table within work directory in txt and Rdata format with file name merged_%s_%s", make.names(format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")), myoutname))
+write.table(df,paste(path, myoutname,".txt",sep=""), sep="\t", quote= F, row.names = F)
+save.image(paste(path, myoutname,".Rdata",sep=""))
+print(sprintf("######### MERGE FUNCTION COMPLETE ######### %s", format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
--- a/pipeline_dge/input_file_links.txt
+++ b/pipeline_dge/input_file_links.txt
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/058/SRR20076358/SRR20076358_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/058/SRR20076358/SRR20076358_2.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/061/SRR20076361/SRR20076361_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/061/SRR20076361/SRR20076361_2.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/063/SRR20076363/SRR20076363_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/063/SRR20076363/SRR20076363_2.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/066/SRR20076366/SRR20076366_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/066/SRR20076366/SRR20076366_2.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/084/SRR20076384/SRR20076384_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/084/SRR20076384/SRR20076384_2.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/086/SRR20076386/SRR20076386_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/086/SRR20076386/SRR20076386_2.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/088/SRR20076388/SRR20076388_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/088/SRR20076388/SRR20076388_2.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/090/SRR20076390/SRR20076390_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/090/SRR20076390/SRR20076390_2.fastq.gz
\ No newline at end of file
--- a/pipeline_dge/reference_file_links.txt
+++ b/pipeline_dge/reference_file_links.txt
+http://igenomes.illumina.com.s3-website-us-east-1.amazonaws.com/Homo_sapiens/UCSC/hg38/Homo_sapiens_UCSC_hg38.tar.gz
--- a/pipeline_dge/run.bitia.sh
+++ b/pipeline_dge/run.bitia.sh
+#!/usr/bin/env bash
+fastqc *.gz
+for file in *_1.fastq.gz
+do
+  trim_galore --cores 8 --gzip --fastqc --max_n 2 --output_dir Trimmed_Data/ --paired $file ${file%_1.fastq.gz}_2.fastq.gz
+done
+hisat2-build -p 8 ./Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa index
+for file in Trimmed_Data/*_1_val_1.fq.gz
+do
+  out_file=${file#Trimmed_Data/}
+  out_file=${out_file%_1_val_1.fq.gz}.sam
+  hisat2 -x index -1 $file -2 ${file%_1_val_1.fq.gz}_2_val_2.fq.gz -S $out_file
+done
+for file in *.sam
+do
+  samtools sort -@ 8 -o ${file%.sam}.bam $file
+done
+mkdir ./htseq_counts
+for file in *.bam
+do
+  htseq-count -r name -s no -f bam $file ./Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf > htseq_counts/${file%.bam}_count.txt
+done
+Rscript htseq-merge_all.R htseq_counts/ count_matrix
+sed '/^__/d' htseq_counts/count_matrix.txt > count_matrix_cleaned.txt
+Rscript count_analysis.r
--- a/pipeline_dge/sample_groups.txt
+++ b/pipeline_dge/sample_groups.txt
+Sample	Condition
+SRR20076384	Automated
+SRR20076386	Automated
+SRR20076388	Automated
+SRR20076390	Automated
+SRR20076358	Suspension
+SRR20076361	Suspension
+SRR20076363	Suspension
+SRR20076366	Suspension
\ No newline at end of file
--- a/pipeline_huge_01/README.md
+++ b/pipeline_huge_01/README.md
+To run this pipeline using `bitia`, Simply execute the following command in this
+directory
+```
+$ bitia
+```
--- a/pipeline_huge_01/SampleInputFileLinks.txt
+++ b/pipeline_huge_01/SampleInputFileLinks.txt
+https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/096/SRR11862696/SRR11862696_1.fastq.gz
+https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/096/SRR11862696/SRR11862696_2.fastq.gz
+https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/097/SRR11862697/SRR11862697_1.fastq.gz
+https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/097/SRR11862697/SRR11862697_2.fastq.gz
+https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/099/SRR11862699/SRR11862699_1.fastq.gz
+https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/099/SRR11862699/SRR11862699_2.fastq.gz
+https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/082/SRR11862682/SRR11862682_1.fastq.gz
+https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/082/SRR11862682/SRR11862682_2.fastq.gz
+https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/091/SRR11862691/SRR11862691_1.fastq.gz
+https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/091/SRR11862691/SRR11862691_2.fastq.gz
+https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/092/SRR11862692/SRR11862692_1.fastq.gz
+https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/092/SRR11862692/SRR11862692_2.fastq.gz
--- a/pipeline_huge_01/run.bitia.sh
+++ b/pipeline_huge_01/run.bitia.sh
+#!/usr/bin/env bash
+set -x
+set -e
+# execute: pip install cutadapt
+ls -ltra .
+pwd
+bitia tools download ./SampleInputFileLinks.txt
+python -m pip install cutadapt
+# SRR11862696_1.fastq.gz:https://blah.blah.com/v1.fastaqc1.gz
+trim_galore --gzip --fastqc --max_n 2 --paired --length 50 \
+    $(readlink SRR11862696_1.fastq.gz) $(readlink SRR11862696_2.fastq.gz)
+fastqc *fastq.gz
--- a/pipeline_medium_01/main.bitia.sh
+++ b/pipeline_medium_01/main.bitia.sh
+#!/usr/bin/env bash
+#fastqc *.gz
+bitia tools download ./small_input_files.txt
+for file in *_1.fastq.gz
+do
+  echo $file ${file%_1.fastq.gz}_2.fastq.gz
+  trim_galore --cores 8 --gzip --fastqc --max_n 2 \
+      --output_dir Trimmed_Data/ \
+      --paired $file ${file%_1.fastq.gz}_2.fastq.gz
+done
--- a/pipeline_medium_01/small_input_files.txt
+++ b/pipeline_medium_01/small_input_files.txt
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/058/SRR20076358/SRR20076358_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/058/SRR20076358/SRR20076358_2.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/061/SRR20076361/SRR20076361_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/061/SRR20076361/SRR20076361_2.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/063/SRR20076363/SRR20076363_1.fastq.gz
+ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/063/SRR20076363/SRR20076363_2.fastq.gz
--- a/pipeline_small_01.zip
+++ b/pipeline_small_01.zip
--- a/pipeline_small_01/main.bitia.sh
+++ b/pipeline_small_01/main.bitia.sh
+#!/usr/bin/env bash
+echo "It does nothing"