Commit e360f66e authored by Lindner's avatar Lindner
Browse files

Initial commit

parents
.snakemake/
genome
logs/
out/
Snakefiles_OldVersions/
trimmed_data/
# import Bio
import csv
# make dictionary for adding read groups to alignments:
# for how to make .csv files, see make_csv_for_dictionary.R
new_data_dict = {}
with open("src/Run_seq2018.csv", 'r') as data_file:
data = csv.DictReader(data_file, delimiter=",")
for row in data:
item = new_data_dict.get(row["Sample"], dict())
item[row["Key"]] = row["Value"]
new_data_dict[row["Sample"]] = item
print(new_data_dict)
dict_seq2018 = new_data_dict
new_data_dict = {}
with open("src/Run_reseq2020.csv", 'r') as data_file:
data = csv.DictReader(data_file, delimiter=",")
for row in data:
item = new_data_dict.get(row["Sample"], dict())
item[row["Key"]] = row["Value"]
new_data_dict[row["Sample"]] = item
print(new_data_dict)
dict_reseq2020 = new_data_dict
dict_Readgroup = dict(seq2018 = dict_seq2018, reseq2020 = dict_reseq2020)
# define wildcards:
SAMPLE, = glob_wildcards("trimmed_data/reseq2020/{sample}_R1_val_1.fq.gz")
RUN = ["seq2018", "reseq2020"]
# rule all:
# input:
# expand("trimmed_data/{run}/{sample}_R1_val_1.fq.gz", sample=SAMPLE)
# rule all:
# input:
# expand("out/{run}/alignment/{sample}.deduplicated.bam", sample=SAMPLE, run=RUN)
rule all:
input:
expand("out/merged/alignment/{sample}.deduplicated_merged.bam", sample=SAMPLE)
# build genome
rule genome:
input:
"genome/genome.fna"
output:
"genome/genome.fna_bowtie2/NC_031784.1.data"
conda:
"src/env_BS_pipeline.yml"
log:
"logs/genome.log"
threads: 1
shell:
"(bs_seeker2-build.py -f {input} --aligner=bowtie2 -d genome) 2> {log}"
# alignment
rule align:
input:
R1Tr="trimmed_data/{run}/{sample}_R1_val_1.fq.gz",
R2Tr="trimmed_data/{run}/{sample}_R2_val_2.fq.gz",
Genome="genome/genome.fna_bowtie2/NC_031784.1.data"
output:
"out/{run}/alignment/{sample}.bam"
conda:
"src/env_BS_pipeline.yml"
log:
"logs/{run}/{sample}_align.log"
threads: 8
shell:
"(bs_seeker2-align.py -g genome/genome.fna -d genome -1 {input.R1Tr} -2 {input.R2Tr} -f bam --aligner=bowtie2 -o {output} --bt2--end-to-end --temp_dir=out/{wildcards.run}/temp/ --bt2-p 4) 2> {log}"
# sort alignment
rule piSort:
input:
"out/{run}/alignment/{sample}.bam"
output:
"out/{run}/alignment/{sample}.sorted.bam"
conda:
"src/env_BS_pipeline.yml"
log:
"logs/{run}/{sample}_sort.log"
threads: 1
shell:
"(picard SortSam I={input} O={output} SORT_ORDER=queryname) 2> {log}"
# deduplication
rule piDedup:
input:
"out/{run}/alignment/{sample}.sorted.bam"
output:
"out/{run}/alignment/{sample}.deduplicated.bam"
conda:
"src/env_BS_pipeline.yml"
log:
"logs/{run}/{sample}_dedup.log"
threads: 1
shell:
"(picard MarkDuplicates I={input} O={output} M={output}.metrics REMOVE_DUPLICATES=true OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 VALIDATION_STRINGENCY=LENIENT MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=100) 2> {log}"
# add read group to samples; use dict_Readgroup
rule readgr:
input:
"out/{run}/alignment/{sample}.deduplicated.bam",
output:
"out/{run}/alignment/{sample}.deduplicated.withRG.bam"
params:
Lane=lambda wildcards: dict_Readgroup[wildcards.run][wildcards.sample]['Lane'],
Barcode=lambda wildcards: dict_Readgroup[wildcards.run][wildcards.sample]['Barcode'],
Library=lambda wildcards: dict_Readgroup[wildcards.run][wildcards.sample]['Library'],
Flowcell=lambda wildcards: dict_Readgroup[wildcards.run][wildcards.sample]['Flowcell']
conda:
"src/env_BS_pipeline.yml"
log:
"logs/{run}/{sample}_RG.log"
threads: 1
shell:
"(picard AddOrReplaceReadGroups I={input} O={output} ID={params.Flowcell}.{params.Lane} LB=KapaBiosystems-{params.Library} PL=illumina PU={params.Flowcell}.{params.Lane}.{params.Barcode} SM={wildcards.sample} CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT SORT_ORDER=queryname) 2> {log}"
# merge files
rule merge:
input:
R2018="out/seq2018/alignment/{sample}.deduplicated.withRG.bam",
R2020="out/reseq2020/alignment/{sample}.deduplicated.withRG.bam"
output:
"out/merged/alignment/{sample}.deduplicated_merged.bam"
conda:
"src/env_BS_pipeline.yml"
log:
"logs/{sample}_merge.log"
threads: 1
shell:
"(picard MergeSamFiles I={input.R2018} I={input.R2020} O={output} SORT_ORDER=queryname) 2> {log}"
/home/NIOO.INT/melaniel/projects/WGBS_Snakemake_Bismark/src
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment