aboutsummaryrefslogtreecommitdiff
path: root/academic/spidey
diff options
context:
space:
mode:
Diffstat (limited to 'academic/spidey')
-rw-r--r--academic/spidey/README22
-rw-r--r--academic/spidey/slack-desc6
-rw-r--r--academic/spidey/spidey.1279
-rw-r--r--academic/spidey/spidey.SlackBuild61
-rw-r--r--academic/spidey/spidey.info6
5 files changed, 349 insertions, 25 deletions
diff --git a/academic/spidey/README b/academic/spidey/README
index 0912181c252be..6e7c198210657 100644
--- a/academic/spidey/README
+++ b/academic/spidey/README
@@ -1,9 +1,15 @@
-Spidey is an mRNA-to-genomic alignment program. For a complete
-description of how Spidey works, visit
-http://www.ncbi.nlm.nih.gov/spidey/spideydoc.html.
+Spidey: an mRNA-to-genomic alignment program.
-This is just repackaging of the ready binary for x86 and will not run
-on x86_64. It will probably work just fine on a Slackware multilib
-box, but we do not support that ;). If you want to build spidey from
-source, you should download and compile the NCBI toolkit. For more
-information: http://www.ncbi.nlm.nih.gov/spidey/spideysource.html
+Spidey is a tool for aligning one or more mRNA sequences to a given
+genomic sequence. It was written with two main goals in mind:
+1) find good alignments regardless of intron size
+2) avoid getting confused by nearby pseudogenes and paralogs.
+
+The following programs provide a GUI to run spidey:
+-ugene
+-perlprimer
+
+This is just repackaging of precompiled binaries:
+- x86 platform: the executable is provided by upstream (NCBI).
+- x86_64 platform: the executable is kindly provided by the UniPro
+Ugene project, where it is part of their External Tools meta-package.
diff --git a/academic/spidey/slack-desc b/academic/spidey/slack-desc
index cd570e41334a2..3689502f746f4 100644
--- a/academic/spidey/slack-desc
+++ b/academic/spidey/slack-desc
@@ -8,12 +8,12 @@
|-----handy-ruler------------------------------------------------------|
spidey: spidey (mRNA-to-genomic alignment)
spidey:
-spidey: Spidey is an mRNA-to-genomic alignment program.
-spidey:
+spidey: Spidey is a tool for aligning one or more mRNA sequences
+spidey: to a given genomic sequence.
spidey:
+spidey: Home: http://www.ncbi.nlm.nih.gov/spidey/index.html
spidey:
spidey:
spidey:
spidey:
spidey:
-spidey: Home: http://www.ncbi.nlm.nih.gov/spidey/index.html
diff --git a/academic/spidey/spidey.1 b/academic/spidey/spidey.1
new file mode 100644
index 0000000000000..f3e2836f95e52
--- /dev/null
+++ b/academic/spidey/spidey.1
@@ -0,0 +1,279 @@
+.TH SPIDEY 1 2005-01-25 NCBI "NCBI Tools User's Manual"
+.SH NAME
+spidey \- align mRNA sequences to a genome
+.SH SYNOPSIS
+.B spidey
+[\|\fB\-\fP\|]
+[\|\fB\-F\fP\ \fIN\fP\|]
+[\|\fB\-G\fP\|]
+[\|\fB\-L\fP\ \fIN\fP\|]
+[\|\fB\-M\fP\ \fIfilename\fP\|]
+[\|\fB\-N\fP\ \fIfilename\fP\|]
+[\|\fB\-R\fP\ \fIfilename\fP\|]
+[\|\fB\-S\fP\ \fIp/m\fP\|]
+[\|\fB\-T\fP\ \fIN\fP\|]
+[\|\fB\-X\fP\|]
+[\|\fB\-a\fP\ \fIfilename\fP\|]
+[\|\fB\-c\fP\ \fIN\fP\|]
+[\|\fB\-d\fP\|]
+[\|\fB\-e\fP\ \fIX\fP\|]
+[\|\fB\-f\fP\ \fIX\fP\|]
+[\|\fB\-g\fP\ \fIX\fP\|]
+\fB\-i\fP\ \fIfilename\fP
+[\|\fB\-j\fP\|]
+[\|\fB\-k\fP\ \fIfilename\fP\|]
+[\|\fB\-l\fP\ \fIN\fP\|]
+\fB\-m\fP\ \fIfilename\fP
+[\|\fB\-n\fP\ \fIN\fP\|]
+[\|\fB\-o\fP\ \fIstr\fP\|]
+[\|\fB\-p\fP\ \fIN\fP\|]
+[\|\fB\-r\fP\ \fIc/d/m/p/v\fP\|]
+[\|\fB\-s\fP\|]
+[\|\fB\-t\fP\ \fIfilename\fP\|]
+[\|\fB\-u\fP\|]
+[\|\fB\-w\fP\|]
+.SH DESCRIPTION
+\fBspidey\fP is a tool for aligning one or more mRNA sequences to a
+given genomic sequence. \fBspidey\fP was written with two main goals
+in mind: find good alignments regardless of intron size; and avoid
+getting confused by nearby pseudogenes and paralogs. Towards the
+first goal, \fBspidey\fP uses BLAST and Dot View (another local
+alignment tool) to find its alignments; since these are both local
+alignment tools, \fBspidey\fP does not intrinsically favor shorter or
+longer introns and has no maximum intron size. To avoid mistakenly
+including exons from paralogs and pseudogenes, \fBspidey\fP first
+defines windows on the genomic sequence and then performs the
+mRNA-to-genomic alignment separately within each window. Because of
+the way the windows are constructed, neighboring paralogs or
+pseudogenes should be in separate windows and should not be included
+in the final spliced alignment.
+.SS Initial alignments and construction of genomic windows
+\fBspidey\fP takes as input a single genomic sequence and a set of
+mRNA accessions or FASTA sequences. All processing is done one mRNA
+sequence at a time. The first step for each mRNA sequence is a
+high-stringency BLAST against the genomic sequence. The resulting
+hits are analyzed to find the genomic windows.
+.PP
+The BLAST alignments are sorted by score and then assigned into
+windows by a recursive function which takes the first alignment and
+then goes down the alignment list to find all alignments that are
+consistent with the first (same strand of mRNA, both the mRNA and
+genomic coordinates are nonoverlapping and linearly consistent). On
+subsequent passes, the remaining alignments are examined and are put
+into their own nonoverlapping, consistent windows, until no alignments
+are left. Depending on how many gene models are desired, the
+top \fIn\fP windows are chosen to go on to the next step and the others
+are deleted.
+.SS Aligning in each window
+Once the genomic windows are constructed, the initial BLAST alignments
+are freed and another BLAST search is performed, this time with the
+entire mRNA against the genomic region defined by the window, and at a
+lower stringency than the initial search. \fBspidey\fP then uses a
+greedy algorithm to generate a high-scoring, nonoverlapping subset of
+the alignments from the second BLAST search. This consistent set is
+analyzed carefully to make sure that the entire mRNA sequence is
+covered by the alignments. When gaps are found between the
+alignments, the appropriate region of genomic sequence is searched
+against the missing mRNA, first using a very low-stringency BLAST and,
+if the BLAST fails to find a hit, using DotView functions to locate
+the alignment. When gaps are found at the ends of the alignments, the
+BLAST and DotView searches are actually allowed to extend past the
+boundaries of the window. If the 3' end of the mRNA does not align
+completely, it is first examined for the presence of a poly(A) tail.
+No attempt is made to align the portion of the mRNA that seems to be a
+poly(A) tail; sometimes there is a poly(A) tail that does align to the
+genomic sequence, and these are noted because they indicate the
+possibility of a pseudogene.
+.PP
+Now that the mRNA is completely covered by the set of alignments, the
+boundaries of the alignments (there should be one alignment per exon
+now) are adjusted so that the alignments abut each other precisely and
+so that they are adjacent to good splice donor and acceptor sites.
+Most commonly, two adjacent exons' alignments overlap by as much as 20
+or 30 base pairs on the mRNA sequence. The true exon boundary may lie
+anywhere within this overlap, or (as we have seen empirically) even a
+few base pairs outside the overlap. To position the exon boundaries,
+the overlap plus a few base pairs on each side is examined for splice
+donor sites, using functions that have different splice matrices
+depending on the organism chosen. The top few splice donor sites (by
+score) are then evaluated as to how much they affect the original
+alignment boundaries. The site that affects the boundaries the least
+is chosen, and is evaluated as to the presence of an acceptor site.
+The alignments are truncated or extended as necessary so that they
+terminate at the splice donor site and so that they do not overlap.
+.SS Final result
+The windows are examined carefully to get the percent identity per
+exon, the number of gaps per exon, the overall percent identity, the
+percent coverage of the mRNA, presence of an aligning or non-aligning
+poly(A) tail, number of splice donor sites and the presence or absence
+of splice donor and acceptor sites for each exon, and the occurrence
+of an mRNA that has a 5' or 3' end (or both) that does not align to
+the genomic sequence. If the overall percent identity and percent
+length coverage are above the user-defined cutoffs, a summary report
+is printed, and, if requested, a text alignment showing identities and
+mismatches is also printed.
+.SS Interspecies alignments
+\fBspidey\fP is capable of performing interspecies alignments. The
+major difference in interspecies alignments is that the mRNA-genomic
+identity will not be close to 100% as it is in intraspecies
+alignments; also, the alignments have numerous and lengthy gaps. If
+\fBspidey\fP is used in its normal mode to do interspecies alignments,
+it produces gene models with many, many short exons. When the
+interspecies flag is set, \fBspidey\fP uses different BLAST parameters
+to encourage longer and more gaps and to not penalize as heavily for
+mismatches. This way, the alignments for the exons are much longer
+and more closely approximate the actual gene structure.
+.SS Extracting CDS alignments
+When \fBspidey\fP is run in network-aware mode or when ASN.1 files are
+used for the mRNA records, it is capable of extracting a CDS alignment
+from an mRNA alignment and printing the CDS information also. Since
+the CDS alignment is just a subset of the mRNA alignment, it is
+relatively straightforward to truncate the exon alignments as
+necessary and to generate a CDS alignment. Furthermore, the
+untranslated regions are now defined, so the percent identity for the
+5' and 3' untranslated regions is also calculated.
+.PP
+.SH OPTIONS
+A summary of options is included below.
+.TP
+\fB\-\fP
+Print usage message.
+.TP
+\fB\-F\fP\ \fIN\fP
+Start of genomic interval desired (from; 0-based).
+.TP
+\fB\-G\fP
+Input file is a GI list.
+.TP
+\fB\-L\fP\ \fIN\fP
+The extra-large intron size to use (default = 220000).
+.TP
+\fB\-M\fP\ \fIfilename\fP
+File with donor splice matrix.
+.TP
+\fB\-N\fP\ \fIfilename\fP
+File with acceptor splice matrix.
+.TP
+\fB\-R\fP\ \fIfilename\fP
+File (including path) to repeat blast database for filtering.
+.TP
+\fB\-S\fP\ \fIp/m\fP
+Restrict to plus (p) or minus (m) strand of genomic sequence.
+.TP
+\fB\-T\fP\ \fIN\fP
+Stop of genomic interval desired (to; 0-based).
+.TP
+\fB\-X\fP
+Use extra-large intron sizes (increases the limit for initial and
+terminal introns from 100kb to 240kb and for all others from 35kb to
+120kb); may result in significantly longer compute times.
+.TP
+\fB\-a\fP\ \fIfilename\fP
+Output file for alignments when directed to a separate file with
+\fB-p\ 3\fP (default = spidey.aln).
+.TP
+\fB\-c\fP\ \fIN\fP
+Identity cutoff, in percent, for quality control purposes.
+.TP
+\fB\-d\fP
+Also try to align coding sequences corresponding to the given mRNA
+records (may require network access).
+.TP
+\fB\-e\fP\ \fIX\fP
+First-pass e-value (default = 1.0e-10). Higher values increase speed
+at the cost of sensitivity.
+.TP
+\fB\-f\fP\ \fIX\fP
+Second-pass e-value (default = 0.001).
+.TP
+\fB\-g\fP\ \fIX\fP
+Third-pass e-value (default = 10).
+.TP
+\fB\-i\fP\ \fIfilename\fP
+Input file containing the genomic sequence in ASN.1 or FASTA format.
+If your computer is running on a network that can access GenBank, you
+can substitute the desired accession number for the filename.
+.TP
+\fB\-j\fP
+Print ASN.1 alignment?
+.TP
+\fB\-k\fP\ \fIfilename\fP
+File for ASN.1 output with \fB-k\fP (default = spidey.asn).
+.TP
+\fB\-l\fP\ \fIN\fP
+Length coverage cutoff, in percent.
+.TP
+\fB\-m\fP\ \fIfilename\fP
+Input file containing the mRNA sequence(s) in ASN.1 or FASTA format,
+or a list of their accessions (with \fB-G\fP). If your computer is
+running on a network that can access GenBank, you can substitute a
+single accession number for the filename.
+.TP
+\fB\-n\fP\ \fIN\fP
+Number of gene models to return per input mRNA (default = 1).
+.TP
+\fB\-o\fP\ \fIstr\fP
+Main output file (default = stdout; contents controlled by \fB-p\fP).
+.TP
+\fB\-p\fP\ \fIN\fP
+Print alignment?
+.RS
+.PD 0
+.IP \fB0\fP
+summary and alignments together (default)
+.IP \fB1\fP
+just the summary
+.IP \fB2\fP
+just the alignments
+.IP \fB3\fP
+summary and alignments in different files
+.PD
+.RE
+.TP
+\fB\-r\fP\ \fIc/d/m/p/v\fP
+Organism of genomic sequence, used to determine splice matrices.
+.RS
+.PD 0
+.IP \fBc\fP
+C. elegans
+.IP \fBd\fP
+Drosophila
+.IP \fBm\fP
+Dictyostelium discoideum
+.IP \fBp\fP
+plant
+.IP \fBv\fP
+vertebrate (default)
+.PD
+.RE
+.TP
+\fB\-s\fP
+Tune for interspecies alignments.
+.TP
+\fB\-t\fP\ \fIfilename\fP
+File with feature table, in 4 tab-delimited columns:
+.RS
+.PD 0
+.IP \fIseqid\fP
+(e.g., \fBNM_04377.1\fP)
+.IP \fIname\fP
+(only \fBrepetitive_region\fP is currently supported)
+.IP \fIstart\fP
+(0-based)
+.IP \fIstop\fP
+(0-based)
+.PD
+.RE
+.TP
+\fB\-u\fP
+Make a multiple alignment of all input mRNAs (which must overlap on
+the genomic sequence).
+.TP
+\fB\-w\fP
+Consider lowercase characters in input FASTA sequences to be masked.
+.SH AUTHOR
+Sarah Wheelan and others at the National Center for Biotechnology
+Information; Steffen Moeller contributed to this documentation.
+.SH SEE ALSO
+.BR blast (1),
+<http://www.ncbi.nlm.nih.gov/spidey>
diff --git a/academic/spidey/spidey.SlackBuild b/academic/spidey/spidey.SlackBuild
index 443940eb11c72..5a31c4f26affd 100644
--- a/academic/spidey/spidey.SlackBuild
+++ b/academic/spidey/spidey.SlackBuild
@@ -1,25 +1,57 @@
#!/bin/sh
-# Slackware build script for spidey
-# Written by Petar Petrov, <ppetrov@paju.oulu.fi> and
-# hereby submitted to the public domain
-
-# THIS SLACKBUILD IS DISTRIBUTETD IN THE HOPE OF BEING
-# USEFUL BUT WITHOUT ANY WARRANTY. THE AUTHOR IS _NOT_
-# RESPONSIBLE FOR ANY DAMAGE OR DATA LOSS CAUSED BY IT.
+# Copyright 2011-2015 Petar Petrov, petar.petrov@student.oulu.fi
+# All rights reserved.
+#
+# Redistribution and use of this script, with or without modification, is
+# permitted provided that the following conditions are met:
+#
+# 1. Redistributions of this script must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PRGNAM=spidey
-VERSION=${VERSION:-20060601}
-BUILD=${BUILD:-1}
+VERSION=${VERSION:-20060601} # Keep the date of the 32bit binary as version.
+BUILD=${BUILD:-2}
TAG=${TAG:-_SBo}
-ARCH=i386
+if [ -z "$ARCH" ]; then
+ case "$( uname -m )" in
+ i?86) ARCH=i386 ;;
+ arm*) ARCH=arm ;;
+ *) ARCH=$( uname -m ) ;;
+ esac
+fi
CWD=$(pwd)
TMP=${TMP:-/tmp/SBo}
PKG=$TMP/package-$PRGNAM
OUTPUT=${OUTPUT:-/tmp}
+
+if [ "$ARCH" != "i386" ] && [ "$ARCH" != "x86_64" ]; then
+ printf "\n\n$ARCH is not supported... \n"
+ exit 1
+fi
+
+# Determine the source arch. Many thanks to the Ugene project for the
+# 64bit executable!
+if [ "$ARCH" = "x86_64" ]; then
+ SRCARCH=".64"
+else
+ SRCARCH=""
+fi
+
set -e
rm -rf $PKG
@@ -28,13 +60,20 @@ cd $TMP
rm -rf $PRGNAM-$VERSION
mkdir $PRGNAM-$VERSION
cd $PRGNAM-$VERSION
-gunzip -c $CWD/$PRGNAM.linux.gz > spidey
+
+gunzip -c $CWD/$PRGNAM.linux${SRCARCH}.gz > spidey
install -D -m755 spidey $PKG/usr/bin/spidey
+mkdir -p $PKG/usr/man/man1
+cp $CWD/$PRGNAM.1 $PKG/usr/man/man1/$PRGNAM.1
+
find $PKG -print0 | xargs -0 file | grep -e "executable" -e "shared object" | grep ELF \
| cut -f 1 -d : | xargs strip --strip-unneeded 2> /dev/null || true
+find $PKG/usr/man -type f -exec gzip -9 {} \;
+for i in $( find $PKG/usr/man -type l ) ; do ln -s $( readlink $i ).gz $i.gz ; rm $i ; done
+
mkdir -p $PKG/usr/doc/$PRGNAM-$VERSION
cat $CWD/$PRGNAM.SlackBuild > $PKG/usr/doc/$PRGNAM-$VERSION/$PRGNAM.SlackBuild
diff --git a/academic/spidey/spidey.info b/academic/spidey/spidey.info
index 324a7da4a9972..3a21a46c7a788 100644
--- a/academic/spidey/spidey.info
+++ b/academic/spidey/spidey.info
@@ -3,8 +3,8 @@ VERSION="20060601"
HOMEPAGE="http://www.ncbi.nlm.nih.gov/spidey/index.html"
DOWNLOAD="ftp://ftp.ncbi.nih.gov/pub/wheelan/Spidey/spidey.linux.gz"
MD5SUM="2e56ef2e4fcf57eca266fb1b3bb56c7e"
-DOWNLOAD_x86_64="UNSUPPORTED"
-MD5SUM_x86_64="UNSUPPORTED"
+DOWNLOAD_x86_64="http://www.student.oulu.fi/~ppetrov/source/spidey.linux.64.gz"
+MD5SUM_x86_64="79f1f95976346e0d0f5c7f717deac176"
REQUIRES=""
MAINTAINER="Petar Petrov"
-EMAIL="ppetrov@paju.oulu.fi"
+EMAIL="petar.petrov@student.oulu.fi"