pkgsrc-WIP-changes archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

ddocent: replace ddocent-test* with ddocent-assembly-test*



Module Name:	pkgsrc-wip
Committed By:	Winston Weinert <winston%ml1.net@localhost>
Pushed By:	winston
Date:		Mon Jun 18 10:49:40 2018 -0500
Changeset:	78d5aaf8432b04e92f30d2e87625b1274fdc2743

Modified Files:
	ddocent/Makefile
	ddocent/PLIST
Added Files:
	ddocent/files/ddocent-assembly-test
	ddocent/files/ddocent-assembly-test-cleanup
Removed Files:
	ddocent/files/ddocent-test

Log Message:
ddocent: replace ddocent-test* with ddocent-assembly-test*

To see a diff of this commit:
https://wip.pkgsrc.org/cgi-bin/gitweb.cgi?p=pkgsrc-wip.git;a=commitdiff;h=78d5aaf8432b04e92f30d2e87625b1274fdc2743

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

diffstat:
 ddocent/Makefile                            |   7 +-
 ddocent/PLIST                               |   4 +-
 ddocent/files/ddocent-assembly-test         | 366 ++++++++++++++++++++++++++++
 ddocent/files/ddocent-assembly-test-cleanup |   8 +
 ddocent/files/ddocent-test                  | 227 -----------------
 5 files changed, 380 insertions(+), 232 deletions(-)

diffs:
diff --git a/ddocent/Makefile b/ddocent/Makefile
index 4363cd996a..b6e9210ac0 100644
--- a/ddocent/Makefile
+++ b/ddocent/Makefile
@@ -49,7 +49,7 @@ SUBST_FILES.prefix+=	${DESTDIR}${PREFIX}/bin/dDocent
 
 USE_LANGUAGES=	# none
 REPLACE_BASH=	dDocent *.sh
-REPLACE_BASH+=	scripts/ddocent-test* scripts/*.sh scripts/dDocent_filters
+REPLACE_BASH+=	scripts/ddocent-assembly-test* scripts/*.sh scripts/dDocent_filters
 REPLACE_PERL=	scripts/*.pl
 REPLACE_PYTHON=	scripts/*.py
 NO_BUILD=	yes
@@ -63,14 +63,15 @@ JAVAJARDIR=	${PREFIX}/lib/java
 INSTALLATION_DIRS=	bin
 
 post-extract:
-	${CP} ${FILESDIR}/ddocent-test ${FILESDIR}/ddocent-test-cleanup \
+	${CP} ${FILESDIR}/ddocent-assembly-test ${FILESDIR}/ddocent-assembly-test-cleanup \
 		${WRKSRC}/scripts
 
 do-install:
 	${INSTALL_SCRIPT} \
 		${WRKSRC}/dDocent \
 		${WRKSRC}/*.sh \
-		${WRKSRC}/scripts/ddocent-test* \
+		${WRKSRC}/scripts/ddocent-assembly-test \
+		${WRKSRC}/scripts/ddocent-assembly-test-cleanup \
 		${WRKSRC}/scripts/*.sh \
 		${WRKSRC}/scripts/*.pl \
 		${WRKSRC}/scripts/dDocent_filters \
diff --git a/ddocent/PLIST b/ddocent/PLIST
index 9de3aea86c..f44045003c 100644
--- a/ddocent/PLIST
+++ b/ddocent/PLIST
@@ -8,8 +8,8 @@ bin/Rename_for_dDocent_with-INDEX.sh
 bin/dDocent
 bin/dDocent_filters
 bin/dDocent_ngs.sh
-bin/ddocent-test
-bin/ddocent-test-cleanup
+bin/ddocent-assembly-test
+bin/ddocent-assembly-test-cleanup
 bin/filter_hwe_by_pop.pl
 bin/filter_missing_ind.sh
 bin/pop_missing_filter.sh
diff --git a/ddocent/files/ddocent-assembly-test b/ddocent/files/ddocent-assembly-test
new file mode 100644
index 0000000000..7696b900cf
--- /dev/null
+++ b/ddocent/files/ddocent-assembly-test
@@ -0,0 +1,366 @@
+#!/usr/bin/env bash
+
+set -e
+
+##########################################################################
+#   Function description:
+#       Pause until user presses return
+##########################################################################
+
+pause()
+{
+    local junk
+    
+    printf "Press return to continue..."
+    read junk
+}
+
+
+##########################################################################
+#   Not necessary if invoking scripts with "bash script-name" as shown
+#   in the dDocent tutorial, but supplied for completeness.  Tutorial
+#   scripts have also been patched upstream to usr /usr/bin/env, but
+#   we won't assume they'll all stay that way.
+##########################################################################
+
+fix_bash_path()
+{
+    # Don't echo these commands
+    { set +x; } 2>/dev/null
+    if [ $# != 1 ]; then
+	printf "Usage: $0 script\n"
+	exit 1
+    fi
+    sed -i.bak -e "s|#!/bin/bash|#!/usr/bin/env bash|g" $1
+    set -x
+}
+
+
+##########################################################################
+#   Main
+##########################################################################
+
+##########################################################################
+#   Workarounds for running dDocent from FreeBSD ports and Pkgsrc.
+#   Include this section in any scripts running dDocent commands.
+##########################################################################
+
+# Hack to allow remake_reference.sh, etc. to find trimmomatic.
+# trimmomatic.jar and adapter files are not an executables and should not
+# be in PATH according to filesystem hierarchy standards, but the dDocent
+# scripts are coded to look for them there, because that's how dDocent
+# installs the bundled Trimmomatic.
+if [ -e /usr/local/share/java/classes/trimmomatic.jar ]; then
+    export PATH=${PATH}:/usr/local/share/java/classes:/usr/local/share/trimmomatic/adapters
+elif [ -e $PKGSRC/lib/java/trimmomatic.jar ]; then
+    export PATH=${PATH}:$PKGSRC/lib/java:$PKGSRC/share/trimmomatic/adapters
+else
+    printf "Error: Trimmomatic is not installed in a known location.\n" >> /dev/stderr
+fi
+
+if ! pwd | fgrep dDocent-test; then
+    mkdir -p dDocent-test
+    cd dDocent-test
+fi
+
+# remake_reference.sh and some other scripts use GNU extensions in awk
+# commands.  Trick systems into using gawk to get around this without
+# having to patch every script.
+mkdir -p ddocent-bin
+ln -sf `which gawk` ddocent-bin/awk
+ln -sf `which python2.7` ddocent-bin/python
+export PATH=`pwd`/ddocent-bin:$PATH
+
+##########################################################################
+#   Actual dDocent commands below
+##########################################################################
+
+##########################################################################
+#   Command sequence from the dDocent tutorial on Github.  See link below.
+##########################################################################
+
+cat << EOM
+
+This script runs the test commands described at
+
+http://ddocent.com/assembly
+
+You may want to follow along on this web page as the script runs.  It will
+pause after each step to allow checking the output.
+
+EOM
+pause
+
+mkdir -p Data
+cd Data
+
+printf "Downloading and unpacking data.zip...\n"
+if [ -e data.zip ]; then
+    mv data.zip /tmp/dDocent-data.zip
+    rm -rf *
+    mv /tmp/dDocent-data.zip data.zip
+else
+    rm -rf *
+    curl --insecure -L -o data.zip \
+	'https://www.dropbox.com/s/t09xjuudev4de72/data.zip?dl=0'
+fi
+
+# Hack: current data.zip contains data.zip.  ??!
+if [ ! -e simRRLs2.py ]; then
+    yes | unzip data.zip || true
+fi
+ls -l
+pause
+
+set -x
+head SimRAD.barcodes
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+cut -f2 SimRAD.barcodes > barcodes
+head barcodes
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+process_radtags -1 SimRAD_R1.fastq.gz -2 SimRAD_R2.fastq.gz -b barcodes \
+    -e ecoRI --renz_2 mspI -r -i gzfastq
+ls | fgrep sample_ | head
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+rm *rem*
+{ set +x; } 2>/dev/null
+pause
+
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+Rename_for_dDocent.sh SimRAD.barcodes
+{ set +x; } 2>/dev/null
+
+set -x
+ls *.fq.gz
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+ls *.F.fq.gz > namelist
+sed -i'' -e 's/.F.fq.gz//g' namelist
+AWK1='BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}'
+AWK2='!/>/'
+AWK3='!/NNN/'
+PERLT='while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}'
+
+cat namelist | parallel --no-notice -j 8 "zcat {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.forward"
+cat namelist | parallel --no-notice -j 8 "zcat {}.R.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.reverse"
+cat namelist | parallel --no-notice -j 8 "paste -d '-' {}.forward {}.reverse | mawk '$AWK3' | sed 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.seqs"
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+cat *.uniq.seqs > uniq.seqs
+for i in {2..20};
+do 
+echo $i >> pfile
+done
+cat pfile | parallel --no-notice \
+    "echo -n {}xxx && mawk -v x={} '\$1 >= x' uniq.seqs |  wc -l" \
+    | mawk  '{gsub("xxx","\t",$0); print;}'| sort -g > uniqseq.data
+rm pfile
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+more uniqseq.data
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+gnuplot << \EOF 
+set terminal dumb size 80, 30
+set autoscale
+set xrange [2:20] 
+unset label
+set title "Number of Unique Sequences with More than X Coverage (Counted within individuals)"
+set xlabel "Coverage"
+set ylabel "Number of Unique Sequences"
+plot 'uniqseq.data' with lines notitle
+pause -1
+EOF
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+parallel --no-notice -j 8 mawk -v x=4 \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' > uniqCperindv
+wc -l uniqCperindv
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+for ((i = 2; i <= 10; i++));
+do
+echo $i >> ufile
+done
+
+cat ufile | parallel --no-notice "echo -n {}xxx && mawk -v x={} '\$1 >= x' uniqCperindv | wc -l" | mawk  '{gsub("xxx","\t",$0); print;}'| sort -g > uniqseq.peri.data
+rm ufile
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+gnuplot << \EOF 
+set terminal dumb size 80, 30
+set autoscale 
+unset label
+set title "Number of Unique Sequences present in more than X Individuals"
+set xlabel "Number of Individuals"
+set ylabel "Number of Unique Sequences"
+plot 'uniqseq.peri.data' with lines notitle
+pause -1
+EOF
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+mawk -v x=4 '$1 >= x' uniqCperindv > uniq.k.4.c.4.seqs
+wc -l uniq.k.4.c.4.seqs
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+cut -f2 uniq.k.4.c.4.seqs > totaluniqseq
+mawk '{c= c + 1; print ">Contig_" c "\n" $1}' totaluniqseq > uniq.fasta
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+sed -e 's/NNNNNNNNNN/\t/g' uniq.fasta | cut -f1 > uniq.F.fasta
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+cd-hit-est -i uniq.F.fasta -o xxx -c 0.8 -T 0 -M 0 -g 1
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed 's/[>Contig_,...]//g' | sort -g -k1 > sort.contig.cluster.ids
+paste sort.contig.cluster.ids totaluniqseq > contig.cluster.totaluniqseq
+sort -k2,2 -g contig.cluster.totaluniqseq | sed -e 's/NNNNNNNNNN/\t/g' > rcluster
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+head rcluster
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+cut -f2 rcluster | uniq | wc -l
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+rainbow div -i rcluster -o rbdiv.out
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+rainbow div -i rcluster -o rbdiv.out -f 0.5 -K 10
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+rainbow merge -o rbasm.out -a -i rbdiv.out
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+rainbow merge -o rbasm.out -a -i rbdiv.out -r 2
+{ set +x; } 2>/dev/null
+pause
+
+# If missing fdesc mount for bash, <(echo "E") will not work
+# cat rbasm.out <(echo "E") |sed 's/[0-9]*:[0-9]*://g' | mawk ' {
+set -x
+echo "E" > endfile
+cat rbasm.out endfile |sed 's/[0-9]*:[0-9]*://g' | mawk ' {
+if (NR == 1) e=$2;
+else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
+else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
+else if ($1 ~/C/) clus=$2;
+else if ($1 ~/L/) len=$2;
+else if ($1 ~/S/) seq=$2;
+else if ($1 ~/N/) freq=$2;
+else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len}
+else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/) {seq1 = seq; fclus=clus; len1=len}
+else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq}
+}' > rainbow.fasta
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+cd-hit-est -i rainbow.fasta -o referenceRC.fasta -M 0 -T 0 -c 0.9
+{ set +x; } 2>/dev/null
+pause
+
+remake_reference.sh 4 4 0.90 PE 2
+{ set +x; } 2>/dev/null
+pause
+
+ReferenceOpt.sh
+
+bash ReferenceOpt.sh 4 8 4 8 PE 16
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+bash remake_reference.sh 4 4 0.90 PE 2
+head reference.fasta
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+wc -l reference.fasta
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+mawk '/>/' reference.fasta | wc -l
+{ set +x; } 2>/dev/null
+pause
+
+set -x
+mawk '/^NAATT.*N*.*CCGN$/' reference.fasta | wc -l
+grep '^NAATT.*N*.*CCGN$' reference.fasta | wc -l
+{ set +x; } 2>/dev/null
+pause
+
+printf "Bonus Section: Optimize reference assemblies? (takes a long time) y/[n] "
+read bonus
+if [ 0$bonus = 0y ]; then
+    set -x
+    { set +x; } 2>/dev/null
+    printf "Running dDocent to trim reads.\n"
+    pause
+    set -x
+    cat << EOM | dDocent || true
+yes
+8
+10g
+yes
+no
+no
+no
+bacon%uwm.edu@localhost
+EOM
+    RefMapOpt.sh 4 8 4 8 0.9 64 PE
+    { set +x; } 2>/dev/null
+    pause
+    more mapping.results
+    pause
+fi
diff --git a/ddocent/files/ddocent-assembly-test-cleanup b/ddocent/files/ddocent-assembly-test-cleanup
new file mode 100644
index 0000000000..4189b3bb03
--- /dev/null
+++ b/ddocent/files/ddocent-assembly-test-cleanup
@@ -0,0 +1,8 @@
+#!/bin/sh -e
+
+if pwd | fgrep dDocent-test; then
+    rm -rf .*.bak Data ddocent-bin
+else
+    printf "$0 can only be run in a dDocent-test directory.\n"
+    exit 1
+fi
diff --git a/ddocent/files/ddocent-test b/ddocent/files/ddocent-test
deleted file mode 100644
index 3aa6ab5d3a..0000000000
--- a/ddocent/files/ddocent-test
+++ /dev/null
@@ -1,227 +0,0 @@
-#!/usr/bin/env bash
-
-set -ex
-
-##########################################################################
-#   Function description:
-#       Pause until user presses return
-##########################################################################
-
-pause()
-{
-    set +x
-    local junk
-    
-    printf "Press return to continue..."
-    read junk
-    set -x
-}
-
-
-##########################################################################
-#   Not necessary if invoking scripts with "bash script-name" as shown
-#   in the dDocent tutorial, but supplied for completeness.  Tutorial
-#   scripts have also been patched upstream to usr /usr/bin/env, but
-#   we won't assume they'll all stay that way.
-##########################################################################
-
-fix_bash_path()
-{
-    # Don't echo these commands
-    set +x
-    if [ $# != 1 ]; then
-	printf "Usage: $0 script\n"
-	exit 1
-    fi
-    sed -i.bak -e "s|#!/bin/bash|#!/usr/bin/env bash|g" $1
-    set -x
-}
-
-
-##########################################################################
-#   Main
-##########################################################################
-
-##########################################################################
-#   Workarounds for running dDocent from FreeBSD ports and Pkgsrc.
-#   Include this section in any scripts running dDocent commands.
-##########################################################################
-
-# Hack to allow remake_reference.sh, etc. to find trimmomatic.
-# trimmomatic.jar and adapter files are not an executables and should not
-# be in PATH according to filesystem hierarchy standards, but the dDocent
-# scripts are coded to look for them there, because that's how dDocent
-# installs the bundled Trimmomatic.
-if [ -e /usr/local/share/java/classes/trimmomatic.jar ]; then
-    export PATH=${PATH}:/usr/local/share/java/classes:/usr/local/share/trimmomatic/adapters
-elif [ -e $PKGSRC/lib/java/trimmomatic.jar ]; then
-    export PATH=${PATH}:$PKGSRC/lib/java:$PKGSRC/share/trimmomatic/adapters
-else
-    printf "Error: Trimmomatic is not installed in a known location.\n" >> /dev/stderr
-fi
-
-if ! pwd | fgrep dDocent-test; then
-    mkdir -p dDocent-test
-    cd dDocent-test
-fi
-
-# remake_reference.sh and some other scripts use GNU extensions in awk
-# commands.  Trick systems into using gawk to get around this without
-# having to patch every script.
-mkdir -p ddocent-bin
-ln -sf `which gawk` ddocent-bin/awk
-ln -sf `which python2.7` ddocent-bin/python
-export PATH=`pwd`/ddocent-bin:$PATH
-
-##########################################################################
-#   Actual dDocent commands below
-##########################################################################
-
-##########################################################################
-#   Command sequence from the dDocent tutorial on Github.  See link below.
-##########################################################################
-
-cat << EOM
-
-This script runs the test commands described at
-
-https://github.com/jpuritz/dDocent/blob/master/tutorials/Reference%20Assembly%20Tutorial.md
-
-You may want to follow along on this web page as the script runs.  It will
-pause after each step to allow checking the output.
-
-EOM
-
-mkdir -p Data
-cd Data
-
-if [ ! -e data.zip ]; then
-    curl --insecure -L -o data.zip \
-	'https://www.dropbox.com/s/t09xjuudev4de72/data.zip?dl=0'
-fi
-
-# Hack: current data.zip contains data.zip.  ??!
-if [ ! -e simRRLs2.py ]; then
-    yes | unzip data.zip || true
-fi
-
-head SimRAD.barcodes
-
-cut -f2 SimRAD.barcodes > barcodes
-head barcodes
-
-process_radtags -1 SimRAD_R1.fastq.gz -2 SimRAD_R2.fastq.gz -b barcodes \
-    -e ecoRI --renz_2 mspI -r -i gzfastq
-ls
-
-rm *rem*
-if [ ! -e Rename_for_dDocent.sh ]; then
-    curl --insecure -L -O https://github.com/jpuritz/dDocent/raw/master/Rename_for_dDocent.sh
-    fix_bash_path Rename_for_dDocent.sh
-fi
-
-bash Rename_for_dDocent.sh SimRAD.barcodes
-ls *.fq.gz
-
-ls *.F.fq.gz > namelist
-sed -i'' -e 's/.F.fq.gz//g' namelist
-AWK1='BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}'
-AWK2='!/>/'
-AWK3='!/NNN/'
-PERLT='while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}'
-
-cat namelist | parallel --no-notice -j 8 "zcat {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.forward"
-cat namelist | parallel --no-notice -j 8 "zcat {}.R.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.reverse"
-cat namelist | parallel --no-notice -j 8 "paste -d '-' {}.forward {}.reverse | mawk '$AWK3' | sed 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.seqs"
-
-cat *.uniq.seqs > uniq.seqs
-for i in {2..20};
-do 
-echo $i >> pfile
-done
-cat pfile | parallel --no-notice "echo -n {}xxx && mawk -v x={} '\$1 >= x' uniq.seqs | wc -l" | mawk  '{gsub("xxx","\t",$0); print;}'| sort -g > uniqseq.data
-rm pfile
-cat uniqseq.data
-
-gnuplot << \EOF 
-set terminal dumb size 120, 30
-set autoscale
-set xrange [2:20] 
-unset label
-set title "Number of Unique Sequences with More than X Coverage (Counted within individuals)"
-set xlabel "Coverage"
-set ylabel "Number of Unique Sequences"
-plot 'uniqseq.data' with lines notitle
-pause -1
-EOF
-
-parallel --no-notice -j 8 mawk -v x=4 \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' > uniqCperindv
-wc -l uniqCperindv
-
-for ((i = 2; i <= 10; i++));
-do
-echo $i >> ufile
-done
-
-cat ufile | parallel --no-notice "echo -n {}xxx && mawk -v x={} '\$1 >= x' uniqCperindv | wc -l" | mawk  '{gsub("xxx","\t",$0); print;}'| sort -g > uniqseq.peri.data
-rm ufile
-gnuplot << \EOF 
-set terminal dumb size 120, 30
-set autoscale 
-unset label
-set title "Number of Unique Sequences present in more than X Individuals"
-set xlabel "Number of Individuals"
-set ylabel "Number of Unique Sequences"
-plot 'uniqseq.peri.data' with lines notitle
-pause -1
-EOF
-
-mawk -v x=4 '$1 >= x' uniqCperindv > uniq.k.4.c.4.seqs
-wc -l uniq.k.4.c.4.seqs
-
-cut -f2 uniq.k.4.c.4.seqs > totaluniqseq
-mawk '{c= c + 1; print ">Contig_" c "\n" $1}' totaluniqseq > uniq.fasta
-
-sed -e 's/NNNNNNNNNN/\t/g' uniq.fasta | cut -f1 > uniq.F.fasta
-cd-hit-est -i uniq.F.fasta -o xxx -c 0.8 -T 0 -M 0 -g 1
-
-mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed 's/[>Contig_,...]//g' | sort -g -k1 > sort.contig.cluster.ids
-paste sort.contig.cluster.ids totaluniqseq > contig.cluster.totaluniqseq
-sort -k2,2 -g contig.cluster.totaluniqseq | sed -e 's/NNNNNNNNNN/\t/g' > rcluster
-
-cut -f2 rcluster | uniq | wc -l
-
-rainbow div -i rcluster -o rbdiv.out
-rainbow div -i rcluster -o rbdiv.out -f 0.5 -K 10
-rainbow merge -o rbasm.out -a -i rbdiv.out
-rainbow merge -o rbasm.out -a -i rbdiv.out -r 2
-
-# If missing fdesc mount for bash, <(echo "E") will not work
-# cat rbasm.out <(echo "E") |sed 's/[0-9]*:[0-9]*://g' | mawk ' {
-echo "E" > endfile
-cat rbasm.out endfile |sed 's/[0-9]*:[0-9]*://g' | mawk ' {
-if (NR == 1) e=$2;
-else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
-else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
-else if ($1 ~/C/) clus=$2;
-else if ($1 ~/L/) len=$2;
-else if ($1 ~/S/) seq=$2;
-else if ($1 ~/N/) freq=$2;
-else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len}
-else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/) {seq1 = seq; fclus=clus; len1=len}
-else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq}
-}' > rainbow.fasta
-
-cd-hit-est -i rainbow.fasta -o referenceRC.fasta -M 0 -T 0 -c 0.9
-
-if [ ! -e remake_reference.sh ]; then
-    curl --insecure -L -O https://github.com/jpuritz/dDocent/raw/master/scripts/remake_reference.sh
-    fix_bash_path remake_reference.sh
-fi
-bash remake_reference.sh 4 4 0.90 PE 2
-
-if [ ! -e ReferenceOpt.sh ]; then
-    curl --insecure -L -O https://github.com/jpuritz/dDocent/raw/master/scripts/ReferenceOpt.sh
-    fix_bash_path ReferenceOpt.sh
-fi
-bash ReferenceOpt.sh 4 8 4 8 PE 16


Home | Main Index | Thread Index | Old Index