#!/bin/sh
#
# $Id: munchlist.X,v 1.70 2015-02-08 00:35:41-08 geoff Exp $
#
# Copyright 1987, 1988, 1989, 1992, 1993, 1999, 2001, 2005, Geoff Kuenning,
# Claremont, CA.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. All modifications to the source code must be clearly marked as
# such. Binary redistributions based on modified source code
# must be clearly marked as modified versions in the documentation
# and/or other materials provided with the distribution.
# 4. The code that causes the 'ispell -v' command to display a prominent
# link to the official ispell Web site may not be removed.
# 5. The name of Geoff Kuenning may not be used to endorse or promote
# products derived from this software without specific prior
# written permission.
#
# THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
# Given a list of words for ispell, generate a reduced list
# in which all possible affixes have been collapsed. The reduced
# list will match the same set of words as the original.
#
# Usage:
#
# munchlist [-l lang] [-c lang] [-T suff] [-s hashfile] [-D] [-w chars] [-v] \
# [file] ...
#
# Options:
#
# -l lang Specifies the language table to be used. The default
# is "$LIBDIR/default.aff".
# -c lang Specifies "conversion" language table. If this option is
# given, the input file(s) will be assumed to be described by
# this table, rather than the table given in the -l option.
# This may be used to convert between incompatible language
# tables. (When in doubt, use this option -- it doesn't
# hurt, and it may save you from creating a dictionary that has
# illegal words in it). The default is no conversion.
# -T suff Specifies that the source word lists are in the format
# of a "suff"-suffixed file, rather than in the
# canonical form. For example, "-T tex" specifies that
# string characters in the word lists are in TeX format.
# The string character conversions are taken from the language
# table specified by the "-l" switch.
# -s hashfile Remove any words that are already covered by the
# dictionary in 'hashfile'. The words will be removed
# only if all affixes are covered. This option should not be
# specified when the main dictionary is being munched.
# 'Hashfile' must have been created with the language
# table given in the -l option, but this is not checked.
# -D Leave temporary files for debugging purposes
# -w Passed on to ispell (specify chars that are part of a word)
# Unfortunately, special characters must be quoted twice
# rather than once when invoking this script. Also, since
# buildhash doesn't accept this option, the final ispell -l
# step ignores it, making it somewhat less than useful.
# -v Report progress to stderr.
#
# The given input files are merged, then processed by 'ispell -c'
# to generate possible affix lists; these are then combined
# and reduced. The final result is written to standard output.
#
# For portability to older systems, I have avoided getopt.
#
# Geoff Kuenning
# 2/28/87
#
# $Log: munchlist.X,v $
# Revision 1.70 2015-02-08 00:35:41-08 geoff
# Be a bit more paranoid about creating temporary files. Fix a problem
# with detecting a new-style sort that refuses to be backwards
# compatible (and yes, it's still cretinism to break backwards
# compatibility--but I have to put up with the cretins).
#
# Revision 1.69 2005/04/28 14:46:51 geoff
# Remove references to the now-obsolete count file.
#
# Revision 1.68 2005/04/27 01:18:34 geoff
# Work around idiotic POSIX incompatibilities in sort. Add secure
# temp-file handling.
#
# Revision 1.67 2005/04/14 23:11:36 geoff
# Pass the -w switch to icombine.
#
# Revision 1.66 2005/04/14 21:25:52 geoff
# Make the temporary-file handling safer (using mktemp, if it exists).
#
# Revision 1.65 2005/04/14 14:39:33 geoff
# Use /tmp as the default temp directory
#
# Revision 1.64 2005/04/14 14:38:23 geoff
# Update license. Protect against modernized (i.e., incompatible) and
# internationalized sort commands. Change the debugging names of the
# minimal-affixes count and stat files.
#
# Revision 1.63 2002/06/20 23:46:16 geoff
# Add yet more locale definitions so that we won't run into bugs caused
# by sorting inconsistencies.
#
# Revision 1.62 2001/09/06 00:30:28 geoff
# Many changes from Eli Zaretskii to support DJGPP compilation.
#
# Revision 1.61 2001/07/25 21:51:46 geoff
# Minor license update.
#
# Revision 1.60 2001/07/23 20:24:04 geoff
# Update the copyright and the license.
#
# Revision 1.59 2001/06/07 08:02:18 geoff
# Fix a copule of typos in comments.
#
# Revision 1.58 2000/11/14 07:27:04 geoff
# Don't generate an extra dot when attempting to preserve the count
# files in -D mode.
#
# Revision 1.57 2000/10/06 23:59:48 geoff
# Don't assume dot is in the path
#
# Revision 1.56 1999/01/07 01:22:42 geoff
# Update the copyright.
#
# Revision 1.55 1997/12/02 06:25:01 geoff
# Start the cross-expansions loop count at 1, not zero.
#
# Revision 1.54 1997/12/01 00:53:52 geoff
# Abort the munchlist cross-product loop if it goes over 100 passes.
#
# Revision 1.53 1995/01/08 23:23:36 geoff
# Support variable hashfile suffixes for DOS purposes.
#
# Revision 1.52 1994/12/27 23:08:46 geoff
# Dynamically determine how to pass backslashes to 'tr' so that it'll
# work on any machine. Define LC_CTYPE to work around yet more
# internationalized sort programs. Work around a bug in GNU uniq that
# uses the wrong separator between counts and duplicated lines.
#
# Revision 1.51 1994/11/21 07:02:54 geoff
# Correctly quote the arguments to 'tr' when detecting systems with
# unsigned sorts. Be sure to provide a zero exit status on all systems,
# even if MUNCHDEBUG is not set.
#
# Revision 1.50 1994/10/25 05:46:05 geoff
# Export values for LANG and LOCALE in an attempt to override some
# stupidly-internationalized sort programs.
#
# Revision 1.49 1994/10/04 03:51:30 geoff
# Add the MUNCHMAIL feature. If the MUNCHMAIL environment variable is
# set to an email address, debugging information about the munchlist run
# will automatically be collected and mailed to that address.
#
# Revision 1.48 1994/05/17 06:32:06 geoff
# Don't look for affix tables in LIBDIR if the name contains a slash
#
# Revision 1.47 1994/04/27 02:50:48 geoff
# Fix some cosmetic flaws in the verbose-mode messages.
#
# Revision 1.46 1994/01/25 07:11:59 geoff
# Get rid of all old RCS log lines in preparation for the 3.1 release.
#
#
LIBDIR=/usr/lib/ispell
TDIR=${TMPDIR-/tmp}
#
# Every scratch file lives in a private directory made by mktemp, so
# nothing is ever written under a predictable name.  If the directory
# cannot be created there is no point in going on.
#
if MUNCHDIR=`mktemp -d ${TDIR}/munchXXXXXXXXXX 2>/dev/null`
then
    # Common prefix of all the temporary file names.
    TMP=${MUNCHDIR}/munch.
else
    echo "$0: Failed to create temporary directory, exiting..." 1>&2
    exit 1
fi
MAILDEBUGDIR=${MUNCHDIR-/tmp}
#
# When MUNCHMAIL is set, everything written to stderr (including the
# shell trace turned on here) is captured in a file for later mailing.
#
case "X$MUNCHMAIL" in
    X)
        ;;
    *)
        exec 2> ${MAILDEBUGDIR}/munchlist.mail
        echo "munchlist $*" 1>&2
        set -vx
        ;;
esac
SORTTMP="-T ${TDIR}" # !!SORTTMP!!
# Debugging copies go to MUNCHDEBUGDIR if it is set, else next to the mail log.
DBDIR=${MUNCHDEBUGDIR-$MAILDEBUGDIR}
# MS-DOS-style systems give themselves away through COMSPEC (or ComSpec);
# on those, executables carry their silly ".exe" suffix.
case "$COMSPEC$ComSpec" in
    '')
        EXE=""
        ;;
    *)
        EXE=".exe"
        ;;
esac
#
# Work out which helper programs to run.  For each one the search
# order is: the directory munchlist itself was started from, then the
# current directory, and finally whatever $PATH provides.
#
# dirname would make this shorter, but it is not available everywhere.
# For the same reason readability (-r) stands in for executability (-x).
#
case "$0" in
    */*)
        bindir=`expr "$0" : '\(.*\)/[^/]*'`
        ;;
    *)
        bindir='.'
        ;;
esac
for prog in buildhash icombine ijoin ispell
do
    if [ -r $bindir/$prog$EXE ]
    then
        progpath=$bindir/$prog$EXE
    elif [ -r ./$prog$EXE ]
    then
        progpath=./$prog$EXE
    else
        # Bare name: leave the lookup to $PATH.
        progpath=$prog
    fi
    case "$prog" in
        buildhash)
            BUILDHASH=$progpath
            ;;
        icombine)
            COMBINE=$progpath
            ;;
        ijoin)
            JOIN=$progpath
            ;;
        ispell)
            ISPELL=$progpath
            ;;
    esac
done
# In one of the most incredibly stupid decisions of all time, some
# genius decided to break backwards compatibility by "deprecating" the
# old-style sort switches even though it was trivial to recognize both
# styles. The result is that thousands of people (like me) will
# have to rewrite shell scripts to tolerate that stupidity. (It's not
# that the new syntax is bad--it's definitely easier to understand.
# But that doesn't excuse breaking compatibility.)
#
# CRETIN_SORT is run as a command ("if $CRETIN_SORT") wherever sort
# keys are chosen: true selects the new "-k" syntax, false the old
# "+pos" syntax.
#
CRETIN_SORT=true
#
# The following is necessary so that some internationalized versions of
# sort(1) don't confuse things by sorting into a nonstandard order.
#
# LC_ALL must be exported along with the rest: a shell variable that is
# merely assigned is only passed to child processes if it happened to
# be inherited from the environment.
#
LANG=C
LOCALE=C
LC_ALL=C
LC_COLLATE=C
LC_CTYPE=C
export LANG LOCALE LC_ALL LC_COLLATE LC_CTYPE
#
# The following aren't strictly necessary, but I've been made paranoid
# by problems with the stuff above. It can't hurt to set them to a
# sensible value.
LC_MESSAGES=C
LC_MONETARY=C
LC_NUMERIC=C
LC_TIME=C
export LC_MESSAGES LC_MONETARY LC_NUMERIC LC_TIME
# Switches that stay off unless requested on the command line.
verbose=false
debug=no
strip=no
# Optional arguments handed on to the helper programs; empty by default.
dictopt=
icflags=
convtabs=
# Language table used when -l is not given.
langtabs="${LIBDIR}/default.aff"
# ispell must never be handed a null argument, so -w always carries
# something.  As long as "A" already belongs to the character set,
# ispell ignores this placeholder.
wchars='-wA'
#
# Hand-rolled option parsing (getopt is avoided for portability).
# Parsing stops at "--", at a lone "-", or at the first argument that
# does not begin with a dash; whatever remains is the list of input
# files.
#
while [ $# -gt 0 ]
do
    case "$1" in
        -l)
            # A name containing a slash is used as given.  Otherwise a
            # readable file of that name wins over the one in LIBDIR.
            case "$2" in
                */*)
                    langtabs=$2
                    ;;
                *)
                    if [ -r "$2" ]
                    then
                        langtabs="$2"
                    else
                        langtabs="${LIBDIR}/$2"
                    fi
                    ;;
            esac
            if [ -r "$langtabs" ]
            then
                shift
            else
                echo "Can't open language table '$2'" 1>&2
                rm -rf $MUNCHDIR
                exit 1
            fi
            ;;
        -c)
            # Conversion table: as given first, then from LIBDIR.
            if [ -r "$2" ]
            then
                convtabs="$2"
            elif [ -r "${LIBDIR}/$2" ]
            then
                convtabs="${LIBDIR}/$2"
            else
                echo "Can't open conversion language table '$2'" 1>&2
                rm -rf $MUNCHDIR
                exit 1
            fi
            shift
            ;;
        -s)
            strip=yes
            dictopt="-d $2"
            shift
            ;;
        -T)
            icflags="-T $2"
            shift
            ;;
        -w)
            wchars="-w$2"
            shift
            ;;
        -D)
            debug=yes
            ;;
        -v)
            verbose=true
            ;;
        --)
            shift
            break
            ;;
        -)
            break
            ;;
        -*)
            echo 'Usage: munchlist [-l lang] [-c lang] [-T suff] [-s hashfile] [-D] [-w chars] [-v] [file] ...' 1>&2
            rm -rf $MUNCHDIR
            exit 2
            ;;
        *)
            break
            ;;
    esac
    shift
done
# Mailing a debug report implies both progress messages and kept files.
case "X$MUNCHMAIL" in
    X)
        ;;
    *)
        verbose=true
        debug=yes
        ;;
esac
# Clean up the scratch directory on hangup, interrupt, broken pipe,
# and terminate.
trap "rm -rf $MUNCHDIR; exit 1" 1 2 13 15
#
# Symbolic names for the scratch files, purely so that the rest of the
# script is easier to read.  Each is ${TMP} plus a one-letter suffix.
#
# Word lists derived directly from the input.
EXPANDEDINPUT="${TMP}a"
STRIPPEDINPUT="${TMP}b"
CRUNCHEDINPUT="${TMP}c"
# Root/flag bookkeeping.
PRODUCTLIST="${TMP}d"
EXPANDEDPAIRS="${TMP}e"
LEGALFLAGLIST="${TMP}f"
JOINEDPAIRS="${TMP}g"
MINIMALAFFIXES="${TMP}h"
# Files used while weeding out illegal cross-products.
CROSSROOTS="${TMP}i"
CROSSEXPANDED="${TMP}j"
CROSSPAIRS="${TMP}k"
CROSSILLEGAL="${TMP}l"
ILLEGALCOMBOS="${TMP}m"
AWKSCRIPT="${TMP}p"
# The dummy dictionary and its hash file.  Ispell insists that hash
# files have a ".hash" suffix.
FAKEDICT="${TMP}n"
FAKEHASH="${TMP}o.hash"
# If this file exists then we should exit with an error.
ERRORFLAGFILE="${TMP}z"
#
# In debug mode every scratch file is also reachable, through a hard
# link, under a descriptive name in DBDIR.
#
case "$debug" in
    yes)
        # The files must exist before they can be linked.
        touch \
            $EXPANDEDINPUT $STRIPPEDINPUT $CRUNCHEDINPUT $PRODUCTLIST \
            $EXPANDEDPAIRS $LEGALFLAGLIST $JOINEDPAIRS $MINIMALAFFIXES \
            $CROSSROOTS $CROSSEXPANDED $CROSSPAIRS $CROSSILLEGAL \
            $ILLEGALCOMBOS $FAKEDICT $FAKEHASH $AWKSCRIPT
        # Get rid of anything left in DBDIR under the same names,
        # including the numbered per-pass copies.
        rm -f \
            ${DBDIR}/EXPANDEDINPUT ${DBDIR}/STRIPPEDINPUT \
            ${DBDIR}/CRUNCHEDINPUT ${DBDIR}/PRODUCTLIST \
            ${DBDIR}/EXPANDEDPAIRS ${DBDIR}/LEGALFLAGLIST \
            ${DBDIR}/JOINEDPAIRS ${DBDIR}/MINIMALAFFIXES \
            ${DBDIR}/CROSSROOTS ${DBDIR}/CROSSEXPANDED \
            ${DBDIR}/CROSSPAIRS ${DBDIR}/CROSSILLEGAL \
            ${DBDIR}/ILLEGALCOMBOS ${DBDIR}/FAKEDICT \
            ${DBDIR}/FAKEHASH.hash ${DBDIR}/AWKSCRIPT \
            ${DBDIR}/CROSSROOTS.[0-9]* ${DBDIR}/CROSSEXP.[0-9]* \
            ${DBDIR}/CROSSPAIRS.[0-9]* ${DBDIR}/CROSSILLEGAL.[0-9]*
        # One link per scratch file.
        ln $EXPANDEDINPUT ${DBDIR}/EXPANDEDINPUT
        ln $STRIPPEDINPUT ${DBDIR}/STRIPPEDINPUT
        ln $CRUNCHEDINPUT ${DBDIR}/CRUNCHEDINPUT
        ln $PRODUCTLIST ${DBDIR}/PRODUCTLIST
        ln $EXPANDEDPAIRS ${DBDIR}/EXPANDEDPAIRS
        ln $LEGALFLAGLIST ${DBDIR}/LEGALFLAGLIST
        ln $JOINEDPAIRS ${DBDIR}/JOINEDPAIRS
        ln $MINIMALAFFIXES ${DBDIR}/MINIMALAFFIXES
        ln $CROSSROOTS ${DBDIR}/CROSSROOTS
        ln $CROSSEXPANDED ${DBDIR}/CROSSEXPANDED
        ln $CROSSPAIRS ${DBDIR}/CROSSPAIRS
        ln $CROSSILLEGAL ${DBDIR}/CROSSILLEGAL
        ln $ILLEGALCOMBOS ${DBDIR}/ILLEGALCOMBOS
        ln $FAKEDICT ${DBDIR}/FAKEDICT
        ln $FAKEHASH ${DBDIR}/FAKEHASH.hash
        ln $AWKSCRIPT ${DBDIR}/AWKSCRIPT
        ;;
esac
#
# run command [argument ...]
#
# Execute the command.  If it fails, record "<command> failed with
# <status>" in ERRORFLAGFILE and exit the current (sub)shell with the
# same status.  The flag file is what lets the main script notice a
# failure that happened in the middle of a pipeline.
#
run()
{
    if "$@"
    then
        :
    else
        status=$?
        touch "$ERRORFLAGFILE"
        echo "$@ failed with $status" > "$ERRORFLAGFILE"
        exit $status
    fi
}
#
# checkerrorflagfile
#
# If ERRORFLAGFILE exists, copy its contents to stderr, remove the
# scratch directory, and exit with status 1.  Otherwise do nothing.
#
checkerrorflagfile()
{
    if [ -e "$ERRORFLAGFILE" ]
    then
        cat "$ERRORFLAGFILE" 1>&2
        rm -rf $MUNCHDIR
        exit 1
    fi
}
# Start with no error flag, then route the helper programs through
# run() so that their failures are recorded in ERRORFLAGFILE.
rm -f "$ERRORFLAGFILE"
ISPELL="run $ISPELL"
COMBINE="run $COMBINE"
JOIN="run $JOIN"
# From here on, any unchecked command failure aborts the script.
set -e
#
# Build a dummy one-word dictionary whose only purpose is to hold a
# compiled copy of a language table.  To begin with that is the
# conversion table; without -c the conversion table is simply the
# language table itself.
#
if [ "X$convtabs" = X ]
then
    convtabs="$langtabs"
fi
echo 'QQQQQQQQ' > $FAKEDICT
$BUILDHASH -s $FAKEDICT $convtabs $FAKEHASH \
  || (
        echo "Couldn't create fake hash file" 1>&2
        rm -rf $MUNCHDIR
        exit 1
     ) \
  || exit 1
#
# Find out how 'sort' orders characters that have the top bit set, so
# that ijoin can be told to do the same.  SIGNED ends up as "-s" if
# such characters sort before ordinary ones, and as "-u" otherwise.
#
# The trick: feed sort two lines, "-u" and a copy of "-s" whose letter
# has been replaced by the high-bit character \365, then turn \365 back
# into "s" and keep only the first line of the sorted output.
#
# Systems disagree about how many backslashes 'tr' needs in front of an
# octal escape, so first keep doubling until "\141" really means "a".
#
backslash='\'
for i in 0 1 2 3
do
    case `echo a | tr "${backslash}141" b` in
        b)
            break
            ;;
    esac
    backslash="$backslash$backslash"
done
SIGNED=`echo '-s
-u' | tr s "${backslash}365" | sort | tr "${backslash}365" s | sed -e 1q`
#
# Collect all the input and expand all the affix options ($ISPELL -e1),
# and preserve (sorted) for later joining in EXPANDEDINPUT. The icombine
# step is to make sure that unneeded capitalizations (e.g., Farmer and farmer)
# are weeded out. The first sort must be folded for icombine; the second
# must be unfolded for join.
#
# With no file arguments the words are read from standard input;
# otherwise the named files are concatenated. The tr step turns every
# space in the ispell output into a newline, so each word ends up on a
# line of its own.
#
$verbose && echo "Collecting input." 1>&2
# Folded-sort keys, in whichever syntax this sort accepts.
if $CRETIN_SORT
then
sortopts='-k 1f,1 -k 1'
else
sortopts='+0f -1 +0'
fi
# The whole if statement is the head of the pipeline.
if [ $# -eq 0 ]
then
$ISPELL "$wchars" -e1 -d $FAKEHASH -p /dev/null | tr " " '
'
else
cat "$@" | $ISPELL "$wchars" -e1 -d $FAKEHASH -p /dev/null | tr " " '
'
fi \
| sort $SORTTMP -u $sortopts \
| $COMBINE $icflags "$wchars" $langtabs \
| sort $SORTTMP -u > $EXPANDEDINPUT
# ISPELL and COMBINE go through run(), which records a failure in
# ERRORFLAGFILE; stop here if that happened anywhere in the pipeline.
checkerrorflagfile
#
# If a conversion table existed, recreate the fake hash file with the
# "real" language table.
#
# The case pattern is $langtabs itself, so nothing is rebuilt when the
# conversion table and the language table are the same. Note that the
# unquoted $langtabs is treated as a shell pattern here.
#
case "$convtabs" in
$langtabs)
;;
*)
$BUILDHASH -s $FAKEDICT $langtabs $FAKEHASH \
|| (echo "Couldn't create fake hash file" 1>&2; \
rm -rf $MUNCHDIR; exit 1) \
|| exit 1
;;
esac
# The dummy dictionary is no longer needed. The glob also removes any
# other file whose name begins with $FAKEDICT (presumably by-products
# that buildhash leaves next to it -- TODO confirm).
rm -f ${FAKEDICT}*
#
# With -s (strip), drop every expanded word that the dictionary given
# on the command line already covers; what is left is the list of
# words this dictionary has to cover.  Without -s nothing is dropped.
# Either way the result is STRIPPEDINPUT.
#
case "X$strip" in
    Xno)
        # Nothing to strip: STRIPPEDINPUT is just another name (hard
        # link) for EXPANDEDINPUT.
        rm -f $STRIPPEDINPUT
        ln $EXPANDEDINPUT $STRIPPEDINPUT
        case "$debug" in
            yes)
                # Re-point the debugging link at the new file.
                rm -f ${DBDIR}/STRIPPEDINPUT
                ln $STRIPPEDINPUT ${DBDIR}/STRIPPEDINPUT
                ;;
        esac
        ;;
    *)
        $verbose && echo "Stripping words already in the dictionary." 1>&2
        $ISPELL "$wchars" -l $dictopt -p /dev/null < $EXPANDEDINPUT \
            > $STRIPPEDINPUT
        checkerrorflagfile
        ;;
esac
#
# Figure out what the flag-marking character is.
#
# The marker is taken from the "flagmarker" line of the table dump
# printed by $ISPELL -D. If the reported value begins with a
# backslash, only the character that follows the backslash is kept.
#
$verbose && echo "Finding flag marker." 1>&2
flagmarker=`$ISPELL -D -d $FAKEHASH \
| sed -n -e '/^flagmarker/s/flagmarker //p'`
case "$flagmarker" in
\\*)
flagmarker=`expr "$flagmarker" : '.\(.\)'`
;;
esac
# ISPELL ran inside a command substitution, so a failure shows up only
# through the error flag file written by run().
checkerrorflagfile
#
# Munch the input to generate roots and affixes ($ISPELL -c). We are
# only interested in words that have at least one affix (grep -E $flagmarker);
# the next step will pick up the rest. Some of the roots are illegal. We
# use join to restrict the output to those root words that are found
# in the original dictionary.
#
$verbose && echo "Generating roots and affixes." 1>&2
# Sort on the root (first field), then on the flags that follow it.
if $CRETIN_SORT
then
sortopts='-k 1,1 -k 2'
else
sortopts='+0 -1 +1'
fi
# As in the collection step, tr puts each item of ispell's output on
# its own line. Fields are separated by the flag marker for both sort
# and ijoin; ijoin reads the sorted list from stdin ("-").
$ISPELL "$wchars" -c -W0 -d $FAKEHASH -p /dev/null < $STRIPPEDINPUT \
| tr " " '
' \
| grep -a -E "$flagmarker" | sort $SORTTMP -u "-t$flagmarker" $sortopts \
| $JOIN $SIGNED "-t$flagmarker" - $EXPANDEDINPUT > $CRUNCHEDINPUT
checkerrorflagfile
#
# We now have a list of legal roots, and of affixes that apply to the
# root words. However, it is possible for some affix flags to generate more
# than one output word. For example, with the flag table entry
#
# flag R: . > ER
# . > ERS
#
# the input "BOTHER" will generate an entry "BOTH/R" in CRUNCHEDINPUT. But
# this will accept "BOTHER" and "BOTHERS" in the dictionary, which is
# wrong (in this case, though it's good English).
#
# To cure this problem, we first have to know which flags generate which
# expansions. We use $ISPELL -e3 to expand the flags (as the example
# below shows, each output line carries the root and flag followed by
# one expansion), and get pairs suitable for joining. In the example
# above, we would get
#
# BOTH/R BOTHER
# BOTH/R BOTHERS
#
# We save this in EXPANDEDPAIRS for the next step.
#
$verbose && echo 'Expanding dictionary into EXPANDEDPAIRS.' 1>&2
# Sort on the expansion (second field), which is the key the next
# step joins on.
if $CRETIN_SORT
then
sortopts='-k 2'
else
sortopts='+1'
fi
$ISPELL "$wchars" -e3 -d $FAKEHASH -p /dev/null < $CRUNCHEDINPUT \
| sort $SORTTMP $sortopts > $EXPANDEDPAIRS
checkerrorflagfile
#
# Now we want to extract the lines in EXPANDEDPAIRS in which the second field
# is *not* listed in the original dictionary EXPANDEDINPUT; these illegal
# lines contain the flags we cannot include without accepting illegal words.
# It is somewhat easier to extract those which actually are listed (with
# join), and then use comm to strip these from EXPANDEDPAIRS to get the
# illegal expansions, together with the flags that generate them (we must
# re-sort EXPANDEDPAIRS before running comm). Sed
# gets rid of the expansion and uniq gets rid of duplicates. Comm then
# selects the remainder of the list from CRUNCHEDINPUT and puts it in
# LEGALFLAGLIST. The final step is to use a sort and icombine to put
# the list into a one-entry-per-root format.
#
# BTW, I thought of using cut for the sed step (on systems that have it),
# but it turns out that sed is faster!
#
# JOINEDPAIRS: the root/flag + expansion pairs whose expansion (field 2
# of EXPANDEDPAIRS) appears in EXPANDEDINPUT.
$JOIN -j1 2 -o 1.1 1.2 $SIGNED $EXPANDEDPAIRS $EXPANDEDINPUT \
| sort $SORTTMP -u > $JOINEDPAIRS
# comm needs whole-line order, so re-sort both files in place.
sort $SORTTMP -o $EXPANDEDPAIRS $EXPANDEDPAIRS
sort $SORTTMP -o $CRUNCHEDINPUT $CRUNCHEDINPUT
$verbose && echo 'Creating list of legal roots/flags.' 1>&2
if $CRETIN_SORT
then
sortopts='-k 1f,1 -k 1'
else
sortopts='+0f -1 +0'
fi
# Each parenthesized stage deletes its input files once its filter has
# finished with them, so the scratch files go away as early as possible.
comm -13 $JOINEDPAIRS $EXPANDEDPAIRS \
| (sed -e 's; .*$;;' ; rm -f $JOINEDPAIRS $EXPANDEDPAIRS) \
| uniq \
| (comm -13 - $CRUNCHEDINPUT ; rm -f $CRUNCHEDINPUT) \
| sort $SORTTMP -u "-t$flagmarker" $sortopts \
| $COMBINE "$wchars" $langtabs > $LEGALFLAGLIST
checkerrorflagfile
#
# LEGALFLAGLIST now contains root/flag combinations that, when expanded,
# produce only words from EXPANDEDPAIRS. However, there is still a
# problem if the language tables have any cross-product flags. A legal
# root may appear in LEGALFLAGLIST with two flags that participate
# in cross-products. When such a dictionary entry is expanded,
# the cross-products will generate some extra words that may not
# be in EXPANDEDPAIRS. We need to remove these from LEGALFLAGLIST.
#
# The first step is to collect the names of the flags that participate
# in cross-products. Ispell will dump the language tables for us, and
# sed is a pretty handy way to strip out extra information. We use
# uniq -c and a numerical sort to put the flags in approximate order of how
# "productive" they are (in terms of how likely they are to generate a lot
# of output words). The least-productive flags are given last and will
# be removed first.
#
# Each line of PRODUCTLIST has the form "<count> p <flag>" (prefix) or
# "<count> s <flag>" (suffix), separated by spaces; later steps look for
# the literal strings ' p ' and ' s '.
#
$verbose \
  && echo 'Creating list of flags that participate in cross-products.' 1>&2
if $CRETIN_SORT
then
    sortopts='-k 1rn,1 -k 3'
else
    sortopts='+0rn -1 +2'
fi
# Some versions of uniq put a tab rather than a space between the count
# and the line.  Turn tabs into spaces so that the space-delimited
# matching below works everywhere.  The tab is written as an octal
# escape, using the backslash quoting that was found to work with this
# system's tr when SIGNED was computed.
$ISPELL -D -d $FAKEHASH \
  | sed -n -e '1,$s/:.*$//
/^flagmarker/d
/^prefixes/,/^suffixes/s/^ flag \*/p /p
/^suffixes/,$s/^ flag \*/s /p' \
  | sort $SORTTMP \
  | uniq -c \
  | tr "${backslash}011" ' ' \
  | sort $SORTTMP $sortopts > $PRODUCTLIST
checkerrorflagfile
# Cross-product handling is needed only when PRODUCTLIST names at least
# one prefix flag (a ' p ' line) and at least one suffix flag (' s ').
if [ `grep -a -F -c ' p ' $PRODUCTLIST` -gt 0 \
-a `grep -a -F -c ' s ' $PRODUCTLIST` -gt 0 ]
then
#
# The language tables allow cross products. See if LEGALFLAGLIST has
# any roots with multiple cross-product flags. Put them in CROSSROOTS.
#
$verbose && echo 'Finding prefix and suffix flags.' 1>&2
# preflags and sufflags are strings made of the flag characters listed
# in PRODUCTLIST, in PRODUCTLIST order, with the newlines deleted.
preflags=`sed -n -e 's/^[ 0-9]*p //p' $PRODUCTLIST | tr -d '
'`
sufflags=`sed -n -e 's/^[ 0-9]*s //p' $PRODUCTLIST | tr -d '
'`
# grep exits nonzero when nothing matches; "|| :" keeps that from
# aborting the script now that set -e is in effect.
grep -a -E "$flagmarker.*[$preflags].*[$sufflags]|$flagmarker.*[$sufflags].*[$preflags]" \
$LEGALFLAGLIST \
> $CROSSROOTS || :
#
# We will need an awk script; it's so big that it core-dumps my shell
# under certain conditions. The rationale behind the script is commented
# where the script is used. Note that you may want to change this
# script for languages other than English.
#
# Choose a sed delimiter that cannot collide with the flag marker.
case "$flagmarker" in
/)
sedchar=:
;;
*)
sedchar=/
;;
esac
$verbose && echo 'Creating awk script.' 1>&2
# The here-document is quoted, so the shell leaves it alone; sed then
# fills in the four upper-case placeholders.
sed -e "s/PREFLAGS/$preflags/" -e "s/SUFFLAGS/$sufflags/" \
-e "s;ILLEGALCOMBOS;$ILLEGALCOMBOS;" \
-e "s${sedchar}FLAGMARKER${sedchar}$flagmarker${sedchar}" \
> $AWKSCRIPT << 'ENDOFAWKSCRIPT'
BEGIN \
{
preflags = "PREFLAGS"
sufflags = "SUFFLAGS"
illegalcombos = "ILLEGALCOMBOS"
flagmarker = "FLAGMARKER"
pflaglen = length (preflags)
for (i = 1; i <= pflaglen; i++)
pflags[i] = substr (preflags, i, 1);
sflaglen = length (sufflags)
for (i = 1; i <= sflaglen; i++)
sflags[i] = substr (sufflags, i, 1);
}
{
len = length ($2)
pnew2 = ""
snew2 = ""
pbad = ""
sbad = ""
sufs = 0
pres = 0
for (i = 1; i <= len; i++)
{
curflag = substr ($2, i, 1)
for (j = 1; j <= pflaglen; j++)
{
if (pflags[j] == curflag)
{
pres++
pnew2 = substr ($2, 1, i - 1) substr ($2, i + 1)
pbad = curflag
}
}
for (j = 1; j <= sflaglen; j++)
{
if (sflags[j] == curflag)
{
sufs++
snew2 = substr ($2, 1, i - 1) substr ($2, i + 1)
sbad = curflag
}
}
}
if (pres == 1)
{
print $1 flagmarker pnew2
print $1 flagmarker pbad >> illegalcombos
}
else if (sufs == 1)
{
print $1 flagmarker snew2
print $1 flagmarker sbad >> illegalcombos
}
else if (pres > 0)
{
print $1 flagmarker pnew2
print $1 flagmarker pbad >> illegalcombos
}
else
{
print $1 flagmarker snew2
print $1 flagmarker sbad >> illegalcombos
}
}
ENDOFAWKSCRIPT
# Start with an empty ILLEGALCOMBOS; the awk script appends to it on
# every pass.
: > $ILLEGALCOMBOS
# dbnum counts the passes; it labels the progress messages and the
# per-pass debugging copies, and bounds the loop.
dbnum=1
while [ -s $CROSSROOTS ]
do
#
# CROSSROOTS contains the roots whose cross-product expansions
# might be illegal. We now need to locate the actual illegal ones.
# We do this in much the same way we created LEGALFLAGLIST from
# CRUNCHEDINPUT. First we make CROSSEXPANDED, which is analogous
# to EXPANDEDPAIRS.
#
$verbose && echo "Creating cross expansions (pass $dbnum)." 1>&2
if $CRETIN_SORT
then
sortopts='-k 2'
else
sortopts='+1'
fi
$ISPELL "$wchars" -e3 -d $FAKEHASH -p /dev/null < $CROSSROOTS \
| sort $SORTTMP $sortopts > $CROSSEXPANDED
checkerrorflagfile
#
# Now we join CROSSEXPANDED against EXPANDEDINPUT to produce
# CROSSPAIRS, and then comm that against CROSSEXPANDED to
# get CROSSILLEGAL, the list of illegal cross-product flag
# combinations.
#
$JOIN -j1 2 -o 1.1 1.2 $SIGNED $CROSSEXPANDED $EXPANDEDINPUT \
| sort $SORTTMP -u > $CROSSPAIRS
checkerrorflagfile
sort $SORTTMP -u -o $CROSSEXPANDED $CROSSEXPANDED
$verbose \
&& echo "Finding illegal cross expansions (pass $dbnum)." 1>&2
comm -13 $CROSSPAIRS $CROSSEXPANDED \
| sed -e 's; .*$;;' \
| uniq > $CROSSILLEGAL
# In debug mode, keep a numbered copy of this pass's files in DBDIR.
# CROSSROOTS is moved, since it is about to be regenerated anyway.
if [ "$debug" = yes ]
then
mv $CROSSROOTS $DBDIR/CROSSROOTS.$dbnum
ln $CROSSEXPANDED $DBDIR/CROSSEXP.$dbnum
ln $CROSSPAIRS $DBDIR/CROSSPAIRS.$dbnum
ln $CROSSILLEGAL $DBDIR/CROSSILLEGAL.$dbnum
fi
#
# Now it is time to try to clear up the illegalities. For
# each word in the illegal list, remove one of the cross-product
# flags. The flag chosen is selected in an attempt to cure the
# problem quickly, as follows: (1) if there is only one suffix
# flag or only one prefix flag, we remove that. (2) If there is
# a prefix flag, we remove the "least desirable" (according to
# the order of preflags). (This may be pro-English prejudice,
# and you might want to change this if your language is prefix-heavy).
# (3) Otherwise we remove the least-desirable suffix flag
#
# The output of the awk script becomes the new CROSSROOTS. In
# addition, we add the rejected flags to ILLEGALCOMBOS (this is done
# inside the awk script) so they can be removed from LEGALFLAGLIST
# later.
#
awk "-F$flagmarker" -f $AWKSCRIPT $CROSSILLEGAL > $CROSSROOTS
# The numbered debugging copies are hard links to the working files.
# Removing the working names here presumably keeps the next pass from
# writing through those links and clobbering the saved copies.
if [ "$debug" = yes ]
then
rm -f $CROSSEXPANDED $CROSSPAIRS $CROSSILLEGAL
fi
dbnum=`expr $dbnum + 1`
# Give up after 100 passes rather than risk looping forever.
if [ $dbnum -gt 100 ]
then
echo "Too many passes, aborting cross-product loop. Munchlist failed." 1>&2
if [ "X$MUNCHMAIL" != X ]
then
(
ls -ld ${DBDIR}/[A-Z]*
cat ${MAILDEBUGDIR}/munchlist.mail
) | mail -s 'Munchlist debug output' "$MUNCHMAIL"
rm -f ${MAILDEBUGDIR}/munchlist.mail
fi
rm -rf $MUNCHDIR
exit 1
fi
done
rm -f $CROSSEXPANDED $CROSSPAIRS $CROSSILLEGAL $AWKSCRIPT
#
# Now we have, in ILLEGALCOMBOS, a list of root/flag combinations
# that must be removed from LEGALFLAGLIST to get the final list
# of truly legal flags. ILLEGALCOMBOS has one flag per line, so
# by turning LEGALFLAGLIST into this form (sed), it's an
# easy task for comm. We have to recombine flags again after the
# extraction, to get all flags for a given root on the same line so that
# cross-products will come out right.
#
if [ -s $ILLEGALCOMBOS ]
then
sort $SORTTMP -u -o $ILLEGALCOMBOS $ILLEGALCOMBOS
$verbose && echo 'Finding roots of cross expansions.' 1>&2
if $CRETIN_SORT
then
sortopts='-k 1f,1 -k 1'
else
sortopts='+0f -1 +0'
fi
# NOTE(review): the sed script below splits on a literal "/" rather
# than on $flagmarker -- confirm that this is intended for tables
# that use a different flag marker.
sort $SORTTMP $LEGALFLAGLIST \
| sed -e '/\/../{
s;^\(.*\)/\(.\)\(.*\);\1/\2\
\1/\3;
P
D
}' \
| comm -23 - $ILLEGALCOMBOS \
| sort $SORTTMP -u "-t$flagmarker" $sortopts \
| $COMBINE "$wchars" $langtabs > $CROSSROOTS
checkerrorflagfile
# CROSSROOTS was only borrowed as a scratch name; the result becomes
# the new LEGALFLAGLIST.
mv $CROSSROOTS $LEGALFLAGLIST
if [ "$debug" = yes ]
then
rm -f ${DBDIR}/LEGALFLAGLIST1
ln $LEGALFLAGLIST ${DBDIR}/LEGALFLAGLIST1
fi
fi
fi
# None of these files is read again by the remaining steps.
rm -f $PRODUCTLIST $CROSSROOTS $ILLEGALCOMBOS $EXPANDEDINPUT
#
# We now have (in LEGALFLAGLIST) a list of roots and flags which will
# accept words taken from EXPANDEDINPUT and no others (though some of
# EXPANDEDINPUT is not covered by this list). However, many of the
# expanded words can be generated in more than one way. For example,
# "bather" can be generated from "bath/R" and "bathe/R". This wastes
# unnecessary space in the raw dictionary and, in some cases, in the
# hash file as well. The solution is to list the various ways of
# getting a given word and choose exactly one. All other things being
# equal, we want to choose the one with the highest expansion length
# to root length ratio. The $ISPELL -e4 option takes care of this by
# providing us with a field to sort on.
#
# The ispell/awk combination is similar to the ispell/sed pipe used to
# generate EXPANDEDPAIRS, except that ispell adds an extra field
# giving the sort order. The first sort gets things in order so the
# first root listed is the one we want, and the second sort (-um) then
# selects that first root. Sed strips the expansion from the root,
# and a final sort -u generates MINIMALAFFIXES, the final list of
# affixes that (more or less) minimally covers what it can from
# EXPANDEDINPUT.
#
$verbose && echo 'Eliminating non-optimal affixes.' 1>&2
#
# Sort keys for picking one root per expanded word:
#
#   sortopts1   by expansion (field 2), then by the sort-order field
#               that ispell -e4 appends (field 3) numerically with the
#               largest first, then by root/flag (field 1)
#   sortopts2   by expansion only, for the "sort -um" that keeps the
#               first line of each group
#   sortopts3   folded, then exact, for the final list
#
# The new-style keys must say exactly what the old-style ones do:
# "+2rn -3" is field 3, i.e. "-k 3rn,3".
#
if $CRETIN_SORT
then
    sortopts1='-k 2,2 -k 3rn,3 -k 1,1'
    sortopts2='-k 2,2'
    sortopts3='-k 1f,1 -k 1'
else
    sortopts1='+1 -2 +2rn -3 +0 -1'
    sortopts2='+1 -2'
    sortopts3='+0f -1 +0'
fi
# Expand LEGALFLAGLIST with $ISPELL -e4, order the lines with
# sortopts1, and let "sort -um" with sortopts2 keep one line for each
# distinct key. Sed then strips everything after the first space,
# leaving the root and its flags, and the last sort (sortopts3, with
# the flag marker as field separator) writes MINIMALAFFIXES without
# duplicates.
$ISPELL "$wchars" -e4 -d $FAKEHASH -p /dev/null < $LEGALFLAGLIST \
| sort $SORTTMP $sortopts1 \
| sort $SORTTMP -um $sortopts2 \
| sed -e 's; .*$;;' \
| sort $SORTTMP -u "-t$flagmarker" $sortopts3 > $MINIMALAFFIXES
# LEGALFLAGLIST has been consumed and is not needed again.
rm -f $LEGALFLAGLIST
#
# Now we're almost done. MINIMALAFFIXES covers some (with luck, most)
# of the words in STRIPPEDINPUT. Now we must create a list of the remaining
# words (those omitted by MINIMALAFFIXES) and add it to MINIMALAFFIXES.
# The best way to do this is to actually build a partial dictionary from
# MINIMALAFFIXES in FAKEHASH, and then use $ISPELL -l to list the words that
# are not covered by this dictionary. This must then be combined with the
# reduced version of MINIMALAFFIXES and sorted to produce the final result.
#
# The result is written to standard output.
#
$verbose && echo "Generating output word list." 1>&2
if $CRETIN_SORT
then
    sortopts='-k 1f,1 -k 1'
else
    sortopts='+0f -1 +0'
fi
if [ -s $MINIMALAFFIXES ]
then
    $BUILDHASH -s $MINIMALAFFIXES $langtabs $FAKEHASH > /dev/null \
      || (echo "Couldn't create intermediate hash file" 1>&2;
        rm -rf $MUNCHDIR;
        exit 1) \
      || exit 1
    if [ "$debug" = yes ]
    then
        # Keep the .stat file that sits next to MINIMALAFFIXES.  The
        # link must have the same name that was just removed; otherwise
        # a stale copy from an earlier run into the same DBDIR makes ln
        # fail, which aborts the script under set -e.
        rm -f ${DBDIR}/MINIMALAFFIXES.stat
        ln $MINIMALAFFIXES.stat ${DBDIR}/MINIMALAFFIXES.stat
    fi
    # Words not covered by the partial dictionary, followed by the
    # combined root/flag list, merged into one sorted list.
    ($ISPELL "$wchars" -l -d $FAKEHASH -p /dev/null < $STRIPPEDINPUT; \
      $COMBINE "$wchars" $langtabs < $MINIMALAFFIXES) \
      | sort $SORTTMP "-t$flagmarker" -u $sortopts
else
    # MINIMALAFFIXES is empty; just produce a sorted version of STRIPPEDINPUT
    sort $SORTTMP "-t$flagmarker" -u $sortopts $STRIPPEDINPUT
fi
# Catch a failure of any run()-wrapped command in the pipeline above.
checkerrorflagfile
#
# When MUNCHMAIL is set, mail a listing of the debugging files together
# with the captured stderr log, then discard the log.  In every case,
# remove the scratch directory and report success.
#
case "X$MUNCHMAIL" in
    X)
        ;;
    *)
        (
            ls -ld ${DBDIR}/[A-Z]*
            cat ${MAILDEBUGDIR}/munchlist.mail
        ) | mail -s 'Munchlist debug output' "$MUNCHMAIL"
        rm -f ${MAILDEBUGDIR}/munchlist.mail
        ;;
esac
rm -rf $MUNCHDIR
exit 0
|