Skip to content
Snippets Groups Projects
Commit ccb99c2a authored by dilawar's avatar dilawar :ant:
Browse files

added pipelines.

parent 533e6a52
No related branches found
No related tags found
No related merge requests found
Showing
with 91047 additions and 0 deletions
To run this pipeline using `bitia`, Simply execute the following command in this
directory
```
$ bitia run <path_to_the analysis_directory>
```
**File and directory details**
- *run.bitia.sh* : Pipeline
- *input_file_links.txt* : Contains links to the sequencing data, which needs to be downloaded
- *sample_groups.txt* : Contains details of the control and experimental groups
- *reference_file_links* : Contains link to the reference genome, which needs to be downloaded
- *htseq-merge_all.r* : Script to merge count files
- *count_analysis.r* : Script for differential gene expression analysis
**R libraries required**
- *edgeR*
library(edgeR)
RawCounts <- read.delim("count_matrix_cleaned.txt", row.names = "ENSEMBL_GeneID")
group <- read.table("sample_groups.txt", header=TRUE, sep="\t", row.names=1)
dgecomplete <- DGEList(RawCounts, group = group$Condition)
logcpm <- cpm(dgecomplete, log=TRUE)
filtData <- filterByExpr(dgecomplete)
dgecomplete <- dgecomplete[filtData, keep.lib.sizes=FALSE]
dgecomplete <- calcNormFactors(dgecomplete)
dgecomplete <- estimateDisp(y = dgecomplete)
fit <- glmQLFit(y=dgecomplete)
qlf <- glmQLFTest(fit, coef = 2)
diff_results <- topTags(qlf, n=Inf)
write.csv(diff_results, file="edgeR_diff_genes.csv")
write.csv(as.data.frame(logcpm), file="edgeR_normcounts.csv")
# Functional analysis
library(org.Hs.eg.db)
library(GO.db)
row.names(qlf) <- keys(org.Hs.eg.db)[1:nrow(qlf)]
annot <- select(org.Hs.eg.db, row.names(qlf), c("ENTREZID"))
go <- goana(qlf, species="Hs")
kegg <- kegga(qlf, species="Hs")
write.csv(go, file="GO_all.csv")
write.csv(kegg, file="kegg_all.csv")
source diff could not be displayed: it is too large. Options to address this: view the blob.
source diff could not be displayed: it is too large. Options to address this: view the blob.
source diff could not be displayed: it is too large. Options to address this: view the blob.
source diff could not be displayed: it is too large. Options to address this: view the blob.
## R script for combine HTSeq outputs into one read count matrix in R
## Written by Ahmed Alhendi
## Usage example: Rscript htseq-combine_all.R <workdir> <Reacount_matrix_output_name>
args <- commandArgs(TRUE)
path <- as.character(args[1])
myoutname <-as.character(args[2])
print(paste0("############################"))
##Read files names
files <- list.files(path=path, pattern="*.txt")
print(sprintf("## Files to be merged are: ##"))
print(files)
print(paste0("############################"))
##Read files names
files <- list.files(path=path, pattern="*.txt")
print(sprintf("## Files to be merged are: ##"))
print(files)
print(paste0("############################"))
# using perl to manpulate file names by trimming file extension
labs <- paste("", gsub("\\.txt", "", files, perl=TRUE), sep="")
##Load all files to list object, use paste to return the trimpping parts to file name
print(sprintf("######### file read START ######### %s", format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
cov <- list()
for (i in labs) {
filepath <- file.path(path,paste(i,".txt",sep=""))
cov[[i]] <- read.table(filepath,sep = "\t", header=F, stringsAsFactors=FALSE)
colnames(cov[[i]]) <- c("ENSEMBL_GeneID", i)
}
print(sprintf("######### file read END ######### %s", format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
## construct one data frame from list of data.frames using reduce function
print(sprintf("######### merge START ######### %s",
format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
df <-Reduce(function(x,y) merge(x = x, y = y, by ="ENSEMBL_GeneID"), cov)
print(sprintf("######### merge END ######### %s", format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
print(sprintf("Exported merged table within work directory in txt and Rdata format with file name merged_%s_%s", make.names(format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")), myoutname))
write.table(df,paste(path, myoutname,".txt",sep=""), sep="\t", quote= F, row.names = F)
save.image(paste(path, myoutname,".Rdata",sep=""))
print(sprintf("######### MERGE FUNCTION COMPLETE ######### %s", format(Sys.time(),"%b_%d_%Y_%H_%M_%S_%Z")))
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/058/SRR20076358/SRR20076358_1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/058/SRR20076358/SRR20076358_2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/061/SRR20076361/SRR20076361_1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/061/SRR20076361/SRR20076361_2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/063/SRR20076363/SRR20076363_1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/063/SRR20076363/SRR20076363_2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/066/SRR20076366/SRR20076366_1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/066/SRR20076366/SRR20076366_2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/084/SRR20076384/SRR20076384_1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/084/SRR20076384/SRR20076384_2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/086/SRR20076386/SRR20076386_1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/086/SRR20076386/SRR20076386_2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/088/SRR20076388/SRR20076388_1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/088/SRR20076388/SRR20076388_2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/090/SRR20076390/SRR20076390_1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/090/SRR20076390/SRR20076390_2.fastq.gz
\ No newline at end of file
http://igenomes.illumina.com.s3-website-us-east-1.amazonaws.com/Homo_sapiens/UCSC/hg38/Homo_sapiens_UCSC_hg38.tar.gz
#!/usr/bin/env bash
fastqc *.gz
for file in *_1.fastq.gz
do
trim_galore --cores 8 --gzip --fastqc --max_n 2 --output_dir Trimmed_Data/ --paired $file ${file%_1.fastq.gz}_2.fastq.gz
done
hisat2-build -p 8 ./Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa index
for file in Trimmed_Data/*_1_val_1.fq.gz
do
out_file=${file#Trimmed_Data/}
out_file=${out_file%_1_val_1.fq.gz}.sam
hisat2 -x index -1 $file -2 ${file%_1_val_1.fq.gz}_2_val_2.fq.gz -S $out_file
done
for file in *.sam
do
samtools sort -@ 8 -o ${file%.sam}.bam $file
done
mkdir ./htseq_counts
for file in *.bam
do
htseq-count -r name -s no -f bam $file ./Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf > htseq_counts/${file%.bam}_count.txt
done
Rscript htseq-merge_all.R htseq_counts/ count_matrix
sed '/^__/d' htseq_counts/count_matrix.txt > count_matrix_cleaned.txt
Rscript count_analysis.r
Sample Condition
SRR20076384 Automated
SRR20076386 Automated
SRR20076388 Automated
SRR20076390 Automated
SRR20076358 Suspension
SRR20076361 Suspension
SRR20076363 Suspension
SRR20076366 Suspension
\ No newline at end of file
To run this pipeline using `bitia`, Simply execute the following command in this
directory
```
$ bitia
```
https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/096/SRR11862696/SRR11862696_1.fastq.gz
https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/096/SRR11862696/SRR11862696_2.fastq.gz
https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/097/SRR11862697/SRR11862697_1.fastq.gz
https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/097/SRR11862697/SRR11862697_2.fastq.gz
https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/099/SRR11862699/SRR11862699_1.fastq.gz
https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/099/SRR11862699/SRR11862699_2.fastq.gz
https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/082/SRR11862682/SRR11862682_1.fastq.gz
https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/082/SRR11862682/SRR11862682_2.fastq.gz
https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/091/SRR11862691/SRR11862691_1.fastq.gz
https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/091/SRR11862691/SRR11862691_2.fastq.gz
https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/092/SRR11862692/SRR11862692_1.fastq.gz
https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR118/092/SRR11862692/SRR11862692_2.fastq.gz
#!/usr/bin/env bash
set -x
set -e
# execute: pip install cutadapt
ls -ltra .
pwd
bitia tools download ./SampleInputFileLinks.txt
python -m pip install cutadapt
# SRR11862696_1.fastq.gz:https://blah.blah.com/v1.fastaqc1.gz
trim_galore --gzip --fastqc --max_n 2 --paired --length 50 \
$(readlink SRR11862696_1.fastq.gz) $(readlink SRR11862696_2.fastq.gz)
fastqc *fastq.gz
#!/usr/bin/env bash
#fastqc *.gz
bitia tools download ./small_input_files.txt
for file in *_1.fastq.gz
do
echo $file ${file%_1.fastq.gz}_2.fastq.gz
trim_galore --cores 8 --gzip --fastqc --max_n 2 \
--output_dir Trimmed_Data/ \
--paired $file ${file%_1.fastq.gz}_2.fastq.gz
done
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/058/SRR20076358/SRR20076358_1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/058/SRR20076358/SRR20076358_2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/061/SRR20076361/SRR20076361_1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/061/SRR20076361/SRR20076361_2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/063/SRR20076363/SRR20076363_1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR200/063/SRR20076363/SRR20076363_2.fastq.gz
File added
#!/usr/bin/env bash
echo "It does nothing"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment