meta-openembedded/contrib/tesseract-langs.sh
Mario Domenech Goulart cb41796a5e contrib/tesseract-langs.sh: add script to generate recipes for tesseract languages
This script writes language recipes for tesseract.  It downloads the
listing of available languages and language tarballs from the official
site and writes language recipes tesseract-lang-<lang>_<version>.bb
for each language.

Signed-off-by: Mario Domenech Goulart <mario@ossystems.com.br>
Signed-off-by: Martin Jansa <Martin.Jansa@gmail.com>
2014-06-10 12:16:23 +02:00

93 lines
2.1 KiB
Bash
Executable File

#! /bin/sh
# Copyright (C) 2014, O.S. Systems Software Ltda. All Rights Reserved
# Released under the MIT license (see meta-openembedded layer's COPYING.MIT)
# Version shared by all tesseract language tarballs handled by this script.
PV="3.02"

# The main software tarball can carry an additional minor version component
# that the language tarballs do not have, for instance:
#   software package: tesseract-ocr-3.02.02.tar.gz
#   language package: tesseract-ocr-3.02.por.tar.gz
MINOR_PV="02"

# First positional argument: the directory the generated recipes go into.
recipes_dir="${1}"
# Print a one-line usage summary for this script on stdout.
usage() {
  # $0 is quoted so that a script path containing whitespace is passed to
  # basename as a single operand instead of being split into several.
  echo "Usage: $(basename "$0") <recipes dir> [ <download dir> ]"
}
# Validate the command line and prepare the working files:
#   recipes_dir   - output directory for the recipes (created if missing)
#   file_list     - temporary file that receives the download listing page
#   dl_dir        - directory holding the language tarballs; the optional
#                   second argument, or a fresh temporary directory
#   remove_dl_dir - non-empty when dl_dir is a temporary directory that this
#                   script created and therefore has to delete again
if [ -z "$recipes_dir" ]; then
  usage
  exit 1
fi

if ! mkdir -p "$recipes_dir"; then
  echo "Cannot create recipes directory $recipes_dir" >&2
  exit 1
fi

file_list_uri='https://code.google.com/p/tesseract-ocr/downloads/list'
file_list=
dl_dir=
remove_dl_dir=

# Remove the temporary listing file and, when this script created it, the
# temporary download directory.  Safe to run more than once.
cleanup_tmp() {
  [ -n "$file_list" ] && rm -f -- "$file_list"
  [ -n "$remove_dl_dir" ] && [ -n "$dl_dir" ] && rm -rf -- "$dl_dir"
  return 0
}

# Run the cleanup on every exit path; turning the common termination signals
# into a plain exit makes the EXIT trap fire for them as well.
trap cleanup_tmp EXIT
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

if ! file_list=$(mktemp); then
  echo "Cannot create a temporary file for the download listing" >&2
  exit 1
fi

if [ -z "$2" ]; then
  if ! dl_dir=$(mktemp -d); then
    echo "Cannot create a temporary download directory" >&2
    exit 1
  fi
  # Only set once the directory really exists and is ours to delete.
  remove_dl_dir=1
else
  dl_dir="$2"
fi

# Quoted so that a download directory containing whitespace stays one path.
if ! mkdir -p "$dl_dir"; then
  echo "Cannot create download directory $dl_dir" >&2
  exit 1
fi
# Print the available tesseract language names, one per line, sorted and
# de-duplicated.
#
# Globals read: file_list (path the listing page is saved to),
#               file_list_uri, PV, MINOR_PV
# Outputs:      language names on stdout; an error message on stderr when
#               the listing cannot be downloaded
# Returns:      1 when the listing download fails, otherwise the status of
#               the final sort
tesseract_langs() {
  local pv_re

  # Without this check a failed download is indistinguishable from a listing
  # that simply contains no language packages.
  if ! wget -q -O "$file_list" "$file_list_uri"; then
    echo "Failed to download $file_list_uri" >&2
    return 1
  fi

  # PV is spliced into regular expressions below; escape its dots so that
  # "3.02" only matches a literal "3.02" and not e.g. "3x02".
  pv_re=$(printf '%s\n' "$PV" | sed -e 's/\./\\./g')

  # 1. keep the listing lines that link to a tarball of version PV
  # 2. reduce each line to the part between "<PV>." and ".tar.gz"
  # 3. drop the entries that belong to the main software package (its minor
  #    version and the matching documentation tarball), not to a language
  grep -E 'a href="detail\?name=tesseract-ocr-'"${pv_re}"'\.[^.]+\.tar\.gz&amp;can=2&amp;q=">' "$file_list" |
    sed -E -e 's/.*tesseract-ocr-'"${pv_re}"'\.([^.]+)\.tar\.gz.*/\1/' |
    grep -Ev "(${MINOR_PV}|${MINOR_PV}-doc-html)" |
    sort -u
}
# Download the tarball of every requested language into dl_dir, skipping
# languages whose tarball is already present there.
#
# Arguments:    $1 - whitespace-separated list of language names
# Globals read: dl_dir, PV
# Outputs:      a "Downloading <uri>" line on stdout per download; an error
#               message on stderr per failed download
# Returns:      0 when every needed tarball was fetched, 1 when at least one
#               download failed (the remaining languages are still tried)
download_lang_files() {
  local langs="$1"
  local lang uri tarball status

  status=0
  # $langs is intentionally unquoted: it is split into one word per language.
  for lang in $langs; do
    tarball="$dl_dir/tesseract-ocr-${PV}.${lang}.tar.gz"
    if [ ! -e "$tarball" ]; then
      uri="https://tesseract-ocr.googlecode.com/files/tesseract-ocr-${PV}.${lang}.tar.gz"
      echo "Downloading $uri"
      if ! wget -q -P "$dl_dir" "$uri"; then
        echo "Failed to download $uri" >&2
        # Do not leave a partial file behind: the existence check above
        # would take it for a complete tarball on the next run.
        rm -f -- "$tarball"
        status=1
      fi
    fi
  done
  return $status
}
# Write the recipe tesseract-lang-<lang>_<PV>.bb for one language into
# recipes_dir, with underscores in the language name replaced by dashes in
# the file name.  The recipe carries the md5 and sha256 checksums of the
# language tarball found in dl_dir.
#
# Arguments:    $1 - language name, as used in the tarball file name
# Globals read: dl_dir, recipes_dir, PV
# Returns:      1 (without writing a recipe) when the tarball is missing,
#               otherwise the status of writing the recipe file
create_recipe() {
  local lang="$1"
  local tarball recipe md5 sha256

  tarball="$dl_dir/tesseract-ocr-${PV}.${lang}.tar.gz"
  # Checksumming a missing tarball would yield a recipe with empty
  # checksums, so refuse to write one.
  if [ ! -f "$tarball" ]; then
    echo "Missing tarball $tarball, not writing a recipe for $lang" >&2
    return 1
  fi

  # Paths are quoted so directories containing whitespace work.
  md5=$(md5sum "$tarball" | awk '{print $1}')
  sha256=$(sha256sum "$tarball" | awk '{print $1}')
  recipe="$recipes_dir/tesseract-lang-$(printf '%s\n' "$lang" | sed -e 's/_/-/g')_${PV}.bb"

  cat > "$recipe" <<EOF
# Copyright (C) 2014, O.S. Systems Software Ltda. All Rights Reserved
# Released under the MIT license (see meta-openembedded layer's COPYING.MIT)
TESSERACT_LANG = "$lang"
require tesseract-lang.inc
SRC_URI[md5sum] = "${md5}"
SRC_URI[sha256sum] = "${sha256}"
EOF
}
# Main flow: list the available languages, download their tarballs, write
# one recipe per language, then remove the temporary files.  The script
# exits non-zero when no language was found or any step reported a failure.
LANGS=$(tesseract_langs)
exit_status=0

if [ -z "$LANGS" ]; then
  echo "No tesseract language packages found at $file_list_uri" >&2
  exit_status=1
else
  download_lang_files "$LANGS" || exit_status=1
  # $LANGS is intentionally unquoted: one word per language.
  for lang in $LANGS; do
    create_recipe "$lang" || exit_status=1
  done
fi

# Only delete the download directory when this script created it; the extra
# emptiness check keeps rm -rf away from an unset path.
if [ -n "$remove_dl_dir" ] && [ -n "$dl_dir" ]; then
  rm -rf -- "$dl_dir"
fi
rm -f -- "$file_list"

exit "$exit_status"