diff options
author | Pierre Cazenave <pwcazenave at gmail {dot} com> | 2010-07-30 02:46:02 -0500 |
---|---|---|
committer | Erik Hanson <erik@slackbuilds.org> | 2010-07-31 22:31:51 -0500 |
commit | e502945912c3ccc6d55a6819bc921cf5f47cc4fd (patch) | |
tree | 5c008c3bf671002bdedefa3f8a7fde59d88bb113 /graphics | |
parent | 43cc5518b42cbafb28111cfa607ce8a50e64bb6a (diff) |
graphics/ocropus: Added (document analysis and OCR system)
Signed-off-by: Robby Workman <rworkman@slackbuilds.org>
Diffstat (limited to 'graphics')
-rw-r--r-- | graphics/ocropus/README | 9 | ||||
-rw-r--r-- | graphics/ocropus/ocrodata-env.diff | 15 | ||||
-rw-r--r-- | graphics/ocropus/ocropus.SlackBuild | 109 | ||||
-rw-r--r-- | graphics/ocropus/ocropus.info | 10 | ||||
-rw-r--r-- | graphics/ocropus/ocroscript.1 | 43 | ||||
-rw-r--r-- | graphics/ocropus/slack-desc | 19 | ||||
-rw-r--r-- | graphics/ocropus/usr-local.diff | 22 |
7 files changed, 227 insertions, 0 deletions
diff --git a/graphics/ocropus/README b/graphics/ocropus/README new file mode 100644 index 000000000000..804acf85ff04 --- /dev/null +++ b/graphics/ocropus/README @@ -0,0 +1,9 @@ +OCRopus is a state-of-the-art document analysis and OCR system, featuring +pluggable layout analysis, pluggable character recognition, statistical +natural language modeling, and multi-lingual capabilities. + +The system is being developed with the generous support from Google and +other organizations; the primary developers are at the IUPR Research +Group at the DFKI Research Center. + +This requires tesseract and iulib. diff --git a/graphics/ocropus/ocrodata-env.diff b/graphics/ocropus/ocrodata-env.diff new file mode 100644 index 000000000000..04cfd5d9af84 --- /dev/null +++ b/graphics/ocropus/ocrodata-env.diff @@ -0,0 +1,15 @@ +Description: Respect the OCRODATA environment variable for all lua scripts. +Author: Jakub Wilk <jwilk@debian.org> + +Index: ocropus-0.3.1/ocroscript/ocrotoplevel.cc +=================================================================== +--- ocropus-0.3.1.orig/ocroscript/ocrotoplevel.cc 2009-11-26 18:47:54.000000000 +0100 ++++ ocropus-0.3.1/ocroscript/ocrotoplevel.cc 2009-11-26 18:47:54.000000000 +0100 +@@ -471,6 +471,7 @@ + lua_call(L, 0, 0); + + // handle OCRODATA environment variable as a directory ++ if(getenv("OCRODATA")) ocrodata = getenv("OCRODATA"); + lua_pushstring(L, ocrodata); + lua_setglobal(L, "ocrodata"); + diff --git a/graphics/ocropus/ocropus.SlackBuild b/graphics/ocropus/ocropus.SlackBuild new file mode 100644 index 000000000000..e8c2ce60b019 --- /dev/null +++ b/graphics/ocropus/ocropus.SlackBuild @@ -0,0 +1,109 @@ +#!/bin/sh + +# Slackware build script for OCROpus. + +# Copyright 2010 Pierre Cazenave <pwcazenave {at} gmail [dot] com> +# All rights reserved. +# +# Redistribution and use of this script, with or without modification, is +# permitted provided that the following conditions are met: +# +# 1. 
Redistributions of this script must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ''AS IS'' AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +PRGNAM=ocropus +VERSION=${VERSION:-0.3.1} +BUILD=${BUILD:-1} +TAG=${TAG:-_SBo} + +DIRVER=${DIRVER:-0.3} + +if [ -z "$ARCH" ]; then + case "$( uname -m )" in + i?86) ARCH=i486 ;; + arm*) ARCH=arm ;; + *) ARCH=$( uname -m ) ;; + esac +fi + +CWD=$(pwd) +TMP=${TMP:-/tmp/SBo} +PKG=$TMP/package-$PRGNAM +OUTPUT=${OUTPUT:-/tmp} + +if [ "$ARCH" = "i486" ]; then + SLKCFLAGS="-O2 -march=i486 -mtune=i686" + LIBDIRSUFFIX="" +elif [ "$ARCH" = "i686" ]; then + SLKCFLAGS="-O2 -march=i686 -mtune=i686" + LIBDIRSUFFIX="" +elif [ "$ARCH" = "x86_64" ]; then + SLKCFLAGS="-O2 -fPIC" + LIBDIRSUFFIX="64" +else + SLKCFLAGS="-O2" + LIBDIRSUFFIX="" +fi + +set -e + +rm -rf $PKG +mkdir -p $TMP $PKG $OUTPUT +cd $TMP +rm -rf $PRGNAM-$DIRVER +tar xvf $CWD/$PRGNAM-$VERSION.tar.gz +cd $PRGNAM-$DIRVER +chown -R root:root . +chmod -R u+w,go+r-w,a-s . 
+ +# Debian patch to fix hardcoded /usr/local paths in some source files +patch -p1 < $CWD/usr-local.diff +# Debian patch to fix behaviour of the OCRODATA environment variable +patch -p1 < $CWD/ocrodata-env.diff + +CFLAGS="$SLKCFLAGS" \ +CXXFLAGS="$SLKCFLAGS" \ +./configure \ + --prefix=/usr \ + --sysconfdir=/etc \ + --localstatedir=/var \ + --libdir=/usr/lib${LIBDIRSUFFIX} \ + --mandir=/usr/man \ + --docdir=/usr/doc/$PRGNAM-$VERSION \ + --with-tesseract=/usr \ + --with-iulib=/usr \ + --without-fst \ + --without-SDL \ + --without-leptonica \ + --build=$ARCH-slackware-linux + +make +make install DESTDIR=$PKG + +find $PKG | xargs file | grep -e "executable" -e "shared object" | grep ELF \ + | cut -f 1 -d : | xargs strip --strip-unneeded 2> /dev/null || true + +# Add Debian's manpage +mkdir -p $PKG/usr/man/man1 +gzip -9c $CWD/ocroscript.1 > $PKG/usr/man/man1/ocroscript.1.gz + +mkdir -p $PKG/usr/doc/$PRGNAM-$VERSION +cp -a CHANGES COPYING DIRS INSTALL README $PKG/usr/doc/$PRGNAM-$VERSION +cat $CWD/$PRGNAM.SlackBuild > $PKG/usr/doc/$PRGNAM-$VERSION/$PRGNAM.SlackBuild + +mkdir -p $PKG/install +cat $CWD/slack-desc > $PKG/install/slack-desc + +cd $PKG +/sbin/makepkg -l y -c n $OUTPUT/$PRGNAM-$VERSION-$ARCH-$BUILD$TAG.${PKGTYPE:-tgz} diff --git a/graphics/ocropus/ocropus.info b/graphics/ocropus/ocropus.info new file mode 100644 index 000000000000..a38b5bc68066 --- /dev/null +++ b/graphics/ocropus/ocropus.info @@ -0,0 +1,10 @@ +PRGNAM="ocropus" +VERSION="0.3.1" +HOMEPAGE="http://sites.google.com/site/ocropus/" +DOWNLOAD="http://ocropus.googlecode.com/files/ocropus-0.3.1.tar.gz" +MD5SUM="2a1b66419ae69ef031d5e6269db15bb5" +DOWNLOAD_x86_64="" +MD5SUM_x86_64="" +MAINTAINER="Pierre Cazenave" +EMAIL="pwcazenave < at > gmail {dot} com" +APPROVED="rworkman" diff --git a/graphics/ocropus/ocroscript.1 b/graphics/ocropus/ocroscript.1 new file mode 100644 index 000000000000..d8087203f732 --- /dev/null +++ b/graphics/ocropus/ocroscript.1 @@ -0,0 +1,43 @@ +.TH ocroscript 1 "June 06, 2008" 
+.SH NAME +ocropus \- command line OCR tool +.SH SYNOPSIS +.B ocroscript +.RI "<script> <arguments>" +.SH DESCRIPTION +You can see a list of all available commands by looking in the $OCROSCRIPTS +(/usr/share/ocropus/scripts/ by default) path. +.PP +The \(oqrecognize\(cq script uses tesseract for recognition and sends the html-based hOCR +output to stdout. Tesseract is probably the most mature text recognizer within +OCRopus at the moment. Natively, Tesseract doesn't do layout analysis, but +combined with OCRopus, it makes for a pretty good OCR system: +.RS +$ ocroscript recognize page.png > page.html +.RE +.PP +Here is a brief summary of the remaining command line commands available. +You will need to look at the script to see what the command line arguments are: +.TP +degrade.lua +Simple document image degradation +.TP +hocr-to-text.lua +Convert hOCR output to plain text. +.TP +line-clean.lua +Given a line image, remove marginal noise and fix some other problems. +.TP +sauvola.lua +Perform Sauvola thresholding. +.SH SEE ALSO +.BR tesseract (1), +.br +.PP +.UR http://code.google.com/p/ocropus/w/list +.UE +.SH AUTHOR +ocroscript was written by Thomas Breuel. +.PP +This manual page was written by Jeffrey Ratcliffe <Jeffrey.Ratcliffe@gmail.com>, +for the Debian project (but may be used by others). diff --git a/graphics/ocropus/slack-desc b/graphics/ocropus/slack-desc new file mode 100644 index 000000000000..00aef62c4448 --- /dev/null +++ b/graphics/ocropus/slack-desc @@ -0,0 +1,19 @@ +# HOW TO EDIT THIS FILE: +# The "handy ruler" below makes it easier to edit a package description. Line +# up the first '|' above the ':' following the base package name, and the '|' +# on the right side marks the last column you can put a character in. You must +# make exactly 11 lines for the formatting to be correct. It's also +# customary to leave one space after the ':'. 
+ + |-----handy-ruler------------------------------------------------------| +ocropus: OCRopus (document analysis and OCR system) +ocropus: +ocropus: OCRopus(tm) is a state-of-the-art document analysis and OCR system +ocropus: featuring pluggable layout analysis, pluggable character recognition, +ocropus: statistical natural language modeling, and multi-lingual capabilities. +ocropus: +ocropus: The system is being developed with the generous support from Google +ocropus: and other organizations; the primary developers are at the IUPR +ocropus: Research Group at the DFKI Research Center. +ocropus: +ocropus: http://sites.google.com/site/ocropus/ diff --git a/graphics/ocropus/usr-local.diff b/graphics/ocropus/usr-local.diff new file mode 100644 index 000000000000..0a17478e45e4 --- /dev/null +++ b/graphics/ocropus/usr-local.diff @@ -0,0 +1,22 @@ +Description: + Use /usr/share/ocropus/scripts/ and /usr/share/ocropus/ as defaults for + OCROSCRIPTS and OCRODATA. +Author: Jakub Wilk <jwilk@debian.org> + +Index: ocropus-0.3.1/ocroscript/ocrotoplevel.cc +=================================================================== +--- ocropus-0.3.1.orig/ocroscript/ocrotoplevel.cc 2009-11-26 16:56:18.000000000 +0100 ++++ ocropus-0.3.1/ocroscript/ocrotoplevel.cc 2009-11-26 17:16:32.000000000 +0100 +@@ -68,10 +68,10 @@ + + // FIXME the Jamfile isn't passing this flag, so for now, this is a workaround + #ifndef OCROSCRIPTS +-#define OCROSCRIPTS "/usr/local/share/ocropus/scripts/" ++#define OCROSCRIPTS "/usr/share/ocropus/scripts/" + #endif + #ifndef OCRODATA +-#define OCRODATA "/usr/local/share/ocropus/" ++#define OCRODATA "/usr/share/ocropus/" + #endif + + const char *ocroscripts = OCROSCRIPTS; |