################################################################
## Analyze groups of peaks obtained by clustering peaks on the basis
## of read density profiles around the summits.

#include makefiles/01_peak_calling.mk
include makefiles/02_motifs.mk
## Path of a makefile, presumably this very file.
## NOTE(review): MAKEFILE is not referenced in the visible part of this file;
## presumably the included makefiles use it to re-invoke make - confirm.
MAKEFILE=makefiles/04_peak_cluster_analysis.mk

## Genome assembly identifier. It is assigned after the include directive
## above, so this value replaces any GENOME assignment made by the included
## makefile (a command-line GENOME=... still takes precedence).
GENOME=mm9

################################################################
## List the clusters
##
## Parameters selecting the directory that holds the peak clusters:
##   CLUSTER_QUANT  value written after "q" in the directory name
##   CLUSTER_KB     value written before "kb" in the directory name
##   CLUSTER_BIN    NOTE(review): not referenced in the visible part of this
##                  file; presumably used by an included makefile - confirm.
CLUSTER_QUANT=99
CLUSTER_KB=7
CLUSTER_BIN=100
CLUSTER_DIR=analysis/peaks_fev2013/mes_hm/${CLUSTER_KB}kb/q${CLUSTER_QUANT}
## CLUSTER_FILES is not a list but a shell command substitution (backticks):
## make stores the text as is, and the shell executes it whenever the variable
## is expanded inside a recipe. The pipeline lists the *_ordered3_cluster*.bed
## files found in CLUSTER_DIR, then strips the .bed suffix and the directory
## prefix from each name.
CLUSTER_FILES=`ls ${CLUSTER_DIR}/*_ordered3_cluster*.bed | perl -pe 's|\.bed||g' | perl -pe 's|${CLUSTER_DIR}/||g'`
## Print the cluster directory, then the output of the CLUSTER_FILES command.
list_clusters:
	@echo "Cluster dir	${CLUSTER_DIR}"
	@echo "${CLUSTER_FILES}"

################################################################
## Run one task on every cluster.
##   CLUSTER_NUMBERS  cluster numbers to visit, in the order given
##   CLUSTER_TASK     target invoked once per cluster through a sub-make,
##                    with CLUSTER_NB set to the current cluster number
## The sub-makes are chained with ";", so a failing cluster does not stop the
## loop and only the status of the last cluster is returned to make.
CLUSTER_TASK=fetch_cluster
CLUSTER_NUMBERS=20 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
iterate_clusters:
	@echo "Iterating over cluster files"
	@for cluster in $(CLUSTER_NUMBERS); do \
		$(MAKE) CLUSTER_NB=$${cluster} $(CLUSTER_TASK); \
	done

## Display the variables describing the currently selected cluster, one
## "name<TAB>value" pair per line.
cluster_param:
	@echo "CLUSTER_NB    	$(CLUSTER_NB)"
	@echo "CLUSTER_DIR	$(CLUSTER_DIR)"
	@echo "CLUSTER_PREFIX	$(CLUSTER_PREFIX)"
	@echo "CLUSTER_FILE	$(CLUSTER_FILE)"
	@echo "CLUSTER_SEQ	$(CLUSTER_SEQ)"


################################################################
## Fetch sequences from UCSC for one peak cluster.
##
## Input:  ${CLUSTER_FILE}.bed. For each line, a 1-bp interval is built from
##         column 1 (chromosome) and column 4 (start = column 4 - 1,
##         end = column 4); column 4 is presumably the peak summit - confirm.
## Output: ${CLUSTER_SEQ}.fasta, written by fetch-sequences run with
##         -extend 1000 (the "_summit_ext1000" suffix mirrors this value).
##
## Variables:
##   CLUSTER_NB  number of the cluster to process (part of CLUSTER_PREFIX)
##   GENOME      genome assembly passed to fetch-sequences -genome
##   V           verbosity passed to fetch-sequences -v; not defined in the
##               visible part of this file (presumably set by an included
##               makefile or on the command line)
##   OPT         optional extra arguments appended to the fetch-sequences
##               command; empty unless defined elsewhere
CLUSTER_NB=5
CLUSTER_PREFIX=mes_swembl_002_100_${CLUSTER_KB}kb_ordered3_cluster${CLUSTER_NB}
CLUSTER_FILE=${CLUSTER_DIR}/${CLUSTER_PREFIX}
CLUSTER_SEQ=${CLUSTER_DIR}/${CLUSTER_PREFIX}_summit_ext1000
## fetch_cluster names a command, not a file: always run it.
.PHONY: fetch_cluster
fetch_cluster:
	@echo
	@echo "Fetching sequence for cluster ${CLUSTER_PREFIX}	${CLUSTER_FILE}"
	@awk '{print $$1"\t"$$4-1"\t"$$4}' ${CLUSTER_FILE}.bed \
		| fetch-sequences -v ${V} -genome ${GENOME} -extend 1000 \
		${OPT} \
		-o ${CLUSTER_SEQ}.fasta
	@echo "	${CLUSTER_SEQ}.fasta"

################################################################
## Position-analysis for a single peak cluster.
## Creates CLUSTER_POS_DIR, then delegates to the "positions" target (not
## defined in the visible part of this file) through a sub-make, handing it
## the cluster sequence file, the output base directory and the sequence
## prefix.
CLUSTER_POS_DIR=$(CLUSTER_DIR)/motifs/position_analysis
cluster_pos:
	@mkdir -p $(CLUSTER_POS_DIR)
	@$(MAKE) V=2 positions \
		POS_SEQ=$(CLUSTER_SEQ).fasta \
		POS_BASE_DIR=$(CLUSTER_POS_DIR) \
		POS_SEQ_PREFIX=$(CLUSTER_PREFIX)_summit_ext1000

################################################################
## Position-analysis for a single peak cluster, with a Markov chain
## background model.
## Creates CLUSTER_POS_DIR, then delegates to the "positions_markov" target
## (not defined in the visible part of this file) through a sub-make, with the
## same three POS_* arguments as the plain position-analysis run.
CLUSTER_POS_DIR=$(CLUSTER_DIR)/motifs/position_analysis
cluster_pos_markov:
	@mkdir -p $(CLUSTER_POS_DIR)
	@$(MAKE) V=2 positions_markov \
		POS_SEQ=$(CLUSTER_SEQ).fasta \
		POS_BASE_DIR=$(CLUSTER_POS_DIR) \
		POS_SEQ_PREFIX=$(CLUSTER_PREFIX)_summit_ext1000

## Cluster position-analysis profiles on clusters of SWEMBL peaks
## produced by clustering peaks on the basis of read density profiles
# positions_clusters_swembl_peak_clusters:
# 	${MAKE} V=2 positions_clusters POS_SEQ_DIR=${CLUSTER_DIR} POS_SEQ=${CLUSTER_SEQ}.fasta POS_BASE_DIR=${CLUSTER_POS_DIR} POS_SEQ_PREFIX=${CLUSTER_PREFIX}_summit_ext1000

## Position-analysis over every cluster and every oligo length.
## Re-enters iterate_clusters with CLUSTER_TASK=positions_all_lengths, a
## target not defined in the visible part of this file (presumably it runs
## POS_TASK once per oligo length - confirm).
##   CLUSTER_POS_TASK  target passed down as POS_TASK
CLUSTER_POS_TASK=cluster_pos
cluster_pos_all:
	$(MAKE) iterate_clusters \
		CLUSTER_TASK=positions_all_lengths \
		POS_TASK=$(CLUSTER_POS_TASK)

################################################################
## Scan cluster peaks to detect OGRE sites, using the motif provided
## by Christelle Cayrou.
##
## Input:  ${CLUSTER_SEQ}.fasta (sequences of the current cluster) and the
##         matrix file ${OGRE_PSSM}, read in "tf" format.
## Output: ${CLUSTER_OGRE_SITES}.ft, written by matrix-scan. The file name
##         records the motif prefix, the p-value threshold and the strand
##         option.
## Once the scan is finished, the ogre_distrib_one_cluster target is run
## through a sub-make.
##
## Variables:
##   OGRE_MOTIF_PREFIX  motif name used in directory and file names
##   OGRE_PSSM          matrix file passed to matrix-scan -m
##   UTH_PVAL           value passed to matrix-scan -uth pval
##   SCAN_STR           strand option passed as is to matrix-scan
##   V                  verbosity passed to matrix-scan -v; not defined in the
##                      visible part of this file
##
## Alternative motif, currently disabled:
#OGRE_PSSM=data/motifs/ogre-like.tf
#OGRE_MOTIF_PREFIX=ogre-like
OGRE_MOTIF_PREFIX=ogre_cayrou
OGRE_PSSM=data/motifs/ogre_cayrou.tf
CLUSTER_OGRE_DIR=${CLUSTER_DIR}/motifs/${OGRE_MOTIF_PREFIX}_sites
UTH_PVAL=1e-4
SCAN_STR=-2str
CLUSTER_OGRE_SITES=${CLUSTER_OGRE_DIR}/${CLUSTER_PREFIX}_${OGRE_MOTIF_PREFIX}_sites_pval${UTH_PVAL}${SCAN_STR}
## ogre_scan_one_cluster names a command, not a file: always run it.
.PHONY: ogre_scan_one_cluster
ogre_scan_one_cluster:
	@echo
	@echo "Scanning OGRE in cluster ${CLUSTER_NB}	${CLUSTER_PREFIX}"
	@mkdir -p ${CLUSTER_OGRE_DIR}
	matrix-scan -v ${V} -quick \
		-i ${CLUSTER_SEQ}.fasta ${SCAN_STR} \
		-origin center \
		-matrix_format tf -m ${OGRE_PSSM} \
		-bginput -markov 1 -uth pval ${UTH_PVAL} \
		-o ${CLUSTER_OGRE_SITES}.ft
	@echo "	${CLUSTER_OGRE_SITES}.ft"
	@${MAKE} ogre_distrib_one_cluster

## Post-process the OGRE sites of the current cluster: run ogre_pos_distrib,
## then ogre_clean_features, each in its own sub-make and with the same four
## OGRE_* arguments. Neither target is defined in the visible part of this
## file (presumably they come from an included makefile - confirm).
ogre_distrib_one_cluster:
	@$(MAKE) ogre_pos_distrib \
		OGRE_SITES=$(CLUSTER_OGRE_SITES) \
		OGRE_SEQ=$(CLUSTER_SEQ) \
		OGRE_SEQ_DIR=$(CLUSTER_DIR) \
		OGRE_SEQ_PREFIX=$(CLUSTER_PREFIX)_summit_ext1000
	@$(MAKE) ogre_clean_features \
		OGRE_SITES=$(CLUSTER_OGRE_SITES) \
		OGRE_SEQ=$(CLUSTER_SEQ) \
		OGRE_SEQ_DIR=$(CLUSTER_DIR) \
		OGRE_SEQ_PREFIX=$(CLUSTER_PREFIX)_summit_ext1000

################################################################
## remove the feature file, which occupies too much space 
#ogre_clean_features:
#	@echo "Cleaning feature file ${CLUSTER_OGRE_SITES}.ft"
#	rm -f ${CLUSTER_OGRE_SITES}.ft

# ################################################################
# ## Compute position distribution of OGRE sites in peak cluster.
# ## We scan on both strands, but we then generate a separate
# ## distribution profile for each strand.
# DISTRIB_IMG_FORMAT=pdf
# #		-ymin 0 -ymax ${OGRE_YMAX} \
# #			-ymin 0 -ymax ${OGRE_YMAX} \
# #OGRE_YMAX=5000
# POS_CI=50
# ogre_pos_distrib:
# 	classfreq -v 1 -i ${CLUSTER_OGRE_SITES}.ft -col 5 -ci ${POS_CI} -o ${CLUSTER_OGRE_SITES}_distrib.tab
# 	@echo "	${CLUSTER_OGRE_SITES}_distrib.tab"
# 	@XYgraph -i ${CLUSTER_OGRE_SITES}_distrib.tab \
# 		-format ${DISTRIB_IMG_FORMAT} \
# 		-xcol 3 -ycol 4 -lines -xsize 800 -ysize 400 \
# 		-title1 'OGRE sites; ${CLUSTER_PREFIX}' \
# 		-xmin -1000 -xmax 1000 \
# 		-yleg1 'Number of sites' \
# 		-xleg1 'position' \
# 		-pointsize 0 \
# 		-o ${CLUSTER_OGRE_SITES}_distrib.${DISTRIB_IMG_FORMAT}
# 	@echo "	${CLUSTER_OGRE_SITES}_distrib.${DISTRIB_IMG_FORMAT}"

# 	@${MAKE} _ogre_pos_distrib_one_strand DISTRIB_STRAND=D
# 	@${MAKE} _ogre_pos_distrib_one_strand DISTRIB_STRAND=R

# _ogre_pos_distrib_one_strand:
# 	@awk '$$4 == "${DISTRIB_STRAND}"' ${CLUSTER_OGRE_SITES}.ft \
# 		| classfreq -v 1 -col 5 -ci ${POS_CI} -o ${CLUSTER_OGRE_SITES}_${DISTRIB_STRAND}_distrib.tab ; \
# 		XYgraph -i  ${CLUSTER_OGRE_SITES}_${DISTRIB_STRAND}_distrib.tab \
# 			-format ${DISTRIB_IMG_FORMAT} \
# 			-xcol 3 -ycol 4 -lines -xsize 800 -ysize 400 \
# 			-title1 'OGRE sites; ${CLUSTER_PREFIX}' \
# 			-title2 'Strand ${DISTRIB_STRAND}' \
# 			-xmin -1000 -xmax 1000 \
# 			-yleg1 'Number of sites' \
# 			-xleg1 'position' \
# 			-pointsize 0 \
# 			-o ${CLUSTER_OGRE_SITES}_${DISTRIB_STRAND}_distrib.${DISTRIB_IMG_FORMAT}
# 	@echo "	${CLUSTER_OGRE_SITES}_${DISTRIB_STRAND}_distrib.${DISTRIB_IMG_FORMAT}"

################################################################
## Motif discovery with MEME on the current cluster.
##
## Steps:
##   1. create the MEME output directory;
##   2. write CLUSTER_MEME_SEQ from the cluster fasta file with
##      convert-seq -top MEME_TOP;
##   3. hand the MEME command line to the my_command target (not defined in
##      the visible part of this file) through the MY_COMMAND variable.
##
## Tunable parameters:
##   MEME_TOP      value passed to convert-seq -top; also part of the names
##                 of the sequence file and of the output directory
##   MEME_NMOTIFS  value passed to meme -nmotifs
##   MEME_MINW     value passed to meme -minw
##   MEME_MAXW     value passed to meme -maxw
MEME_TOP=300
MEME_NMOTIFS=3
MEME_MINW=10
MEME_MAXW=15
CLUSTER_MEME_SEQ=$(CLUSTER_SEQ)_top$(MEME_TOP)seq.fasta
CLUSTER_MEME_DIR=$(CLUSTER_DIR)/motifs/meme/$(CLUSTER_PREFIX)_meme_top$(MEME_TOP)seq
MEME_CMD=meme $(CLUSTER_MEME_SEQ) \
	-nmotifs $(MEME_NMOTIFS) \
	-evt 1 -minw $(MEME_MINW) -maxw $(MEME_MAXW) -revcomp -dna -mod anr -maxsize 1000000 \
	-oc $(CLUSTER_MEME_DIR) 
meme_one_cluster:
	@mkdir -p $(CLUSTER_MEME_DIR)
	convert-seq -i $(CLUSTER_SEQ).fasta -from fasta -to fasta -top $(MEME_TOP) -o $(CLUSTER_MEME_SEQ)
	@$(MAKE) my_command MY_COMMAND="$(MEME_CMD)"
