#!/bin/sh
#
# $Id: findaffix.X,v 1.23 2015-02-08 00:35:41-08 geoff Exp $
#
# Copyright 1992, 1993, 1999, 2001, 2005, Geoff Kuenning, Claremont, CA
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. All modifications to the source code must be clearly marked as
# such. Binary redistributions based on modified source code
# must be clearly marked as modified versions in the documentation
# and/or other materials provided with the distribution.
# 4. The code that causes the 'ispell -v' command to display a prominent
# link to the official ispell Web site may not be removed.
# 5. The name of Geoff Kuenning may not be used to endorse or promote
# products derived from this software without specific prior
# written permission.
#
# THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
# Find possible affixes for use with ispell
#
# Usage:
#
# findaffix [-p | -s] [-f] [-c] [-m min] [-M max] [-e elim] [-l low] \
# [-t tabchar] [files]
#
# Each common prefix (-p) or suffix (-s, default) is presented, along
# with statistics to indicate how useful such an affix might be in
# reducing the size of the input file. Only those affixes which
# produce a legal root (one found in the original input) are reported.
#
# If the "-c" option is not given, the output lines are in the
# following format:
#
# strip/add/count/bytes
#
# where "strip" is the string that should be stripped from a root
# word before adding the affix, "add" is the affix to be added, "count"
# is a count of the number of times that this "strip/add" combination
# appears, and "bytes" is an estimate of the number of bytes that
# will be saved in the raw dictionary file if this combination is
# added to the affix file. The field separator in the output will
# normally be the tab character specified by the "-t" switch; the
# default is a slash ("/").
#
# If the "-c" ("clean output") option is given, the appearance of
# the output is made cleaner by changing it to:
#
# -strip+add<tab>count<tab>bytes
#
# where "strip," "add," "count," and "bytes" are as before, and "<tab>"
# represents the ASCII tab character.
#
# The method used to generate possible affixes will also generate
# longer affixes which have common headers or trailers. For example,
# the two words "moth" and "mother" will generate not only the obvious
# substitution "+er" but also "-h+her" and "-th+ther" (and possibly
# even longer ones, depending on the value of "min"). To prevent
# cluttering the output with such affixes, any affix pair that shares
# a common header (or, for prefixes, trailer) string longer than
# "elim" characters (default 1) will be suppressed. You may want to
# set "elim" to a value greater than 1 if your language has string
# characters; usually the need for this parameter will become obvious
# when you examine the output of your findaffix run.
#
# Normally, the output is sorted on the "bytes" field. If the "-f"
# flag is given, the output is sorted according to the "count" field.
#
# No affix longer than "max" characters (default 8) will be reported.
# Smaller values of "max" will make the script run faster.
#
# Affixes which appear fewer than "low" times (default 10) are
# suppressed. This significantly reduces the size of the output file.
#
# Affixes which generate stems shorter than "min" characters (default 3)
# are suppressed. (A stem is the word after the "strip" string has
# been removed, and before the "add" string has been added.) This
# reduces both the running time and the size of the output file. "Min"
# should only be set to 1 if you have a *lot* of free time and disk
# space.
#
# The script requires a non-blank field-separator character for internal
# use. Normally, this character is a slash ("/"), but if the slash
# appears as a character in the input word list, a different character
# can be specified with the "-t" switch.
#
# If the input files are ispell dictionaries, they should be expanded
# before being fed to this script.
#
# If the input files contain characters other than [A-Za-z], they
# should be translated to lowercase before being fed to this script.
#
# $Log: findaffix.X,v $
# Revision 1.23 2015-02-08 00:35:41-08 geoff
# Be a bit more paranoid about creating temporary files.
#
# Revision 1.22 2005/04/27 01:18:34 geoff
# Work around idiotic POSIX incompatibilities in sort. Add secure
# temp-file handling.
#
# Revision 1.21 2005/04/14 14:39:33 geoff
# Use /tmp as the default temp directory
#
# Revision 1.20 2005/04/14 14:38:23 geoff
# Update license. Protect against modernized (i.e., incompatible) and
# internationalized sort commands.
#
# Revision 1.19 2001/09/06 00:30:28 geoff
# Many changes from Eli Zaretskii to support DJGPP compilation.
#
# Revision 1.18 2001/07/25 21:51:46 geoff
# Minor license update.
#
# Revision 1.17 2001/07/23 20:24:03 geoff
# Update the copyright and the license.
#
# Revision 1.16 1999/01/07 01:22:55 geoff
# Update the copyright.
#
# Revision 1.15 1994/01/25 07:11:29 geoff
# Get rid of all old RCS log lines in preparation for the 3.1 release.
#
#
# In one of the most incredibly stupid decisions of all time, some
# genius decided to break backwards compatibility by "deprecating" the
# old-style sort switches even though it was trivial to recognize both
# styles. The result is that thousands of people (like me) will
# have to rewrite shell scripts to tolerate that stupidity. (It's not
# that the new syntax is bad--it's definitely easier to understand.
# But that doesn't excuse breaking compatibility.)
#
CRETIN_SORT=true
#
# The following is necessary so that some internationalized versions of
# sort(1) don't confuse things by sorting into a nonstandard order.
#
# LC_ALL is exported together with the other variables: it takes
# precedence over every LC_* category and over LANG, so the settings
# below only take effect in child processes (sort, join, awk) if the
# C value of LC_ALL is guaranteed to reach them too.
#
LANG=C
LOCALE=C
LC_ALL=C
LC_COLLATE=C
LC_CTYPE=C
export LANG LOCALE LC_ALL LC_COLLATE LC_CTYPE
#
# The following aren't strictly necessary, but I've been made paranoid
# by problems with the stuff above. It can't hurt to set them to a
# sensible value.
LC_MESSAGES=C
LC_MONETARY=C
LC_NUMERIC=C
LC_TIME=C
export LC_MESSAGES LC_MONETARY LC_NUMERIC LC_TIME
#
# Create a private temporary directory under $TMPDIR (or /tmp if TMPDIR
# is unset) and give up immediately if that fails. TMP is the filename
# prefix used for the work files inside that directory; SORTTMP is the
# option handed to every sort(1) invocation so that sort keeps its own
# scratch files under the same parent directory.
#
TDIR=${TMPDIR-/tmp}
TEMPDIR=`mktemp -d ${TDIR}/faffXXXXXXXXXX 2>/dev/null` || { echo "$0: Failed to create temporary directory; exiting..." 1>&2; exit 1; }
TMP=${TEMPDIR}/faff.
SORTTMP="-T ${TDIR}" # !!SORTTMP!!
USAGE='Usage: findaffix [-p | -s] [-f] [-c] [-e elim] [-m min] [-M max] [-l low] [-t tabch] [files]'
#
# Built-in defaults for the tunable parameters; each one can be
# overridden by a command-line switch.
#
tabch=/
cleanout=no
lowcount=10
elimlen=1
minstem=3
maxlim=8
#
# Default (suffix-mode) awk fragments.
#
# LOOP is spliced into the first awk program. For a word of length
# "len" it prints one "head<tabch>tail<tabch>len" line for every split
# point that leaves a head of at least minstem characters and a tail of
# at most maxlim characters, followed by a line for the whole word with
# an empty tail.
#
LOOP='
i = len - maxlim + 1
if (i < minstem + 1)
i = minstem + 1
for ( ; i <= len; i++)
print substr ($0, 1, i - 1) tabch substr ($0, i) tabch len
print $0 tabch tabch len'
#
# ELIM is the body of the second awk program. It passes a line through
# only if its first two fields differ and also differ somewhere in their
# first elimlen characters.
#
ELIM='$1!=$2 \
{
if (substr ($1, 1, elimlen) != substr ($2, 1, elimlen))
print
}'
#
# Key options for the final sort: by field 4 (bytes) descending, then
# field 3 (count) descending, then fields 2 and 1. Start from the
# old-style spelling and switch to "-k" syntax when CRETIN_SORT says so.
#
finalsortopts='+3rn -4 +2rn -3 +1 -2 +0 -1'
if $CRETIN_SORT; then
    finalsortopts='-k 4rn,4 -k 3rn,3 -k 2,2 -k 1,1'
fi
#
# Parse the command-line switches. Parsing stops at the first argument
# that does not start with "-"; whatever remains in "$@" afterwards is
# the list of input files.
#
# The temporary directory already exists at this point, so every error
# exit below removes it before leaving.
#
while :
do
    case "$1" in
        -p)
            #
            # Prefix mode: replace the suffix-mode awk fragments.
            # LOOP now emits "tail<tabch>head<tabch>len", i.e. the stem
            # is what remains after removing a leading affix of 1..lim
            # characters.
            #
            LOOP='
                lim = len - minstem
                if (lim > maxlim)
                    lim = maxlim
                for (i = 1; i <= lim; i++)
                    print substr ($0, i + 1) tabch substr ($0, 1, i) tabch len
                print $0 tabch tabch len'
            #
            # ELIM compares the *trailing* elimlen characters of the two
            # affixes. The start position is computed explicitly (and
            # clamped to 1 for affixes shorter than elimlen) so that
            # values of elimlen greater than 1 really look at that many
            # characters instead of only the last one.
            #
            ELIM='$1!=$2 \
                {
                tail1 = length ($1) - elimlen + 1
                if (tail1 < 1)
                    tail1 = 1
                tail2 = length ($2) - elimlen + 1
                if (tail2 < 1)
                    tail2 = 1
                if (substr ($1, tail1) != substr ($2, tail2))
                    print
                }'
            shift
            ;;
        -s)
            # Suffix mode is the default; nothing to change.
            shift
            ;;
        -f)
            # Sort the final output on the count field (3) first, then
            # on the bytes field (4).
            if $CRETIN_SORT
            then
                finalsortopts='-k 3rn,3 -k 4rn,4 -k 2,2 -k 1,1'
            else
                finalsortopts='+2rn -3 +3rn -4 +1 -2 +0 -1'
            fi
            shift
            ;;
        -c)
            cleanout=yes
            shift
            ;;
        -e|-m|-M|-l|-t)
            # These switches take a value. Refuse a switch that is the
            # last argument rather than silently storing an empty value.
            if [ $# -lt 2 ]
            then
                echo "$0: option $1 requires an argument" 1>&2
                echo "$USAGE" 1>&2
                rm -rf "$TEMPDIR"
                exit 1
            fi
            case "$1" in
                -e) elimlen=$2 ;;
                -m) minstem=$2 ;;
                -M) maxlim=$2 ;;
                -l) lowcount=$2 ;;
                -t) tabch="$2" ;;
            esac
            shift; shift
            ;;
        -*)
            echo "$USAGE" 1>&2
            rm -rf "$TEMPDIR"
            exit 1
            ;;
        *)
            break
            ;;
    esac
done
#
# From here on, remove the temporary directory if we are killed. A
# broken pipe (signal 13) is treated as a normal exit, because it only
# means that whoever was reading our output stopped reading; hangup,
# interrupt and terminate (1, 2, 15) exit with failure.
#
trap "rm -rf $TEMPDIR; exit 0" 13
trap "rm -rf $TEMPDIR; exit 1" 1 2 15
#
# Step one: gather the word list (from the named files, or from standard
# input when no files were given), fold A-Z to a-z, and store the sorted,
# duplicate-free result in the work file for the pipeline below.
#
case $# in
    0)
        tr '[A-Z]' '[a-z]'
        ;;
    *)
        cat "$@" | tr '[A-Z]' '[a-z]'
        ;;
esac \
    | sort -u $SORTTMP > ${TMP}a
#
# Now the monstrous pipeline. The awk command produces several lines for
# each input word. Each line contains a possible stem (first field),
# a possible affix, and the length of the original word. The loop which
# does this was placed into the LOOP variable by the code above (q.v.).
#
# The first sort puts this output into an order appropriate for feeding
# to 'join'. The join command then combines stems and affixes, and for
# each puts out an affix to strip, an affix to add, and the length of
# the word before and after modification.
#
# From here on out the job is relatively easy. The second 'awk' gets rid
# of lines that have the same strip and add affixes, and also eliminates
# lines where the strip and add affix have a common leading (for suffixes)
# or trailing (for prefixes) substring, or where the strip affix is longer
# than the add affix (this is all done by the $ELIM variable, which is also
# set up by the code above). The second sort collects identical affixes;
# the third 'awk' functions like 'uniq -c', replacing duplicate affixes
# with a count and summing the estimate of bytes saved. It also eliminates
# any affixes which appear less frequently than the minimum ("lowcount").
# Finally, the third sort ($finalsortopts) rearranges the list in the chosen
# sort order.
#
#
# Choose the key options for the two intermediate sorts, spelled in
# whichever syntax this system's sort(1) accepts:
# sortopts1 - by field 1, then by field 2 through end of line
# sortopts2 - by field 2, then by field 1
#
if $CRETIN_SORT
then
sortopts1='-k 1,1 -k 2'
sortopts2='-k 2,2 -k 1,1'
else
sortopts1='+0 -1 +1'
sortopts2='+1 -2 +0 -1'
fi
#
# Stage 1: expand every word of two or more characters into its
# candidate stem/affix/length lines using the $LOOP fragment, sort them,
# and write the result back over the work file. (sort -o is used for
# the write-back; the awk input is the same file.)
#
awk "BEGIN{minstem=$minstem; maxlim=$maxlim; tabch="'"'"$tabch"'"}
{
len = length ($0)
if (len < 2)
next
'"$LOOP"'
}' < ${TMP}a \
| sort "-t$tabch" $sortopts1 $SORTTMP -o ${TMP}a
#
# Stage 2: join the work file with itself on its first field, emitting
# field 2 of the left line followed by fields 2 and 3 of the right line.
# The awk running $ELIM then filters those lines, the next sort groups
# equal pairs of the first two fields, and the following awk collapses
# each group into "field1 field2 count total", where total is the sum of
# the third field; groups with fewer than lowcount members are dropped.
# The last sort applies $finalsortopts, and the trailing "if" formats
# the result on its way to standard output.
#
join "-t$tabch" -o 1.2 2.2 2.3 ${TMP}a ${TMP}a \
| awk "-F$tabch" "BEGIN{elimlen=$elimlen}$ELIM" \
| sort "-t$tabch" $sortopts2 $SORTTMP \
| awk "-F$tabch" 'BEGIN{tabch="'"$tabch"'"; lowcount='"$lowcount"'}
{
if ($1 == last1 && $2 == last2)
{
count++
totchars += $3
}
else
{
if ((last1 != "" || last2 != "") && count >= lowcount)
print last1 tabch last2 tabch count tabch totchars
count = 1
last1 = $1
last2 = $2
totchars = $3
}
}
END {
if ((last1 != "" || last2 != "") && count >= lowcount)
print last1 tabch last2 tabch count tabch totchars
}' \
| sort "-t$tabch" $finalsortopts $SORTTMP \
| if [ "$cleanout" = "yes" ]
then
# "Clean" output was requested. Build a sed substitution that matches
# the separator character literally: sedsub is the pattern for the
# separator and sedsep is the delimiter of the s command itself.
case "$tabch" in
# A slash separator would clash with the usual s/// delimiter, so
# delimit the s command with ";" instead.
/)
sedsub=/
sedsep=';'
;;
# These characters are special in a sed pattern; escape them with a
# backslash.
.|\*|\[|\^|\$|\\)
sedsub="\\$tabch"
sedsep=/
;;
# Anything else is used as-is.
*)
sedsub="$tabch"
sedsep=/
;;
esac
# The first expression replaces every separator with a whitespace
# character. The next two turn the first whitespace into "+" and put
# "-" at the start of the line. The last two drop a leading "-" that
# is directly followed by "+", and a "+" that is directly followed by
# whitespace.
# NOTE(review): the header comment describes the output separator as
# a tab - confirm that the whitespace inside these expressions is a
# literal tab in the real file.
exec sed -e "s$sedsep$sedsub$sedsep ${sedsep}g" \
-e 's/ /+/' -e 's/^/-/' \
-e 's/^-+/+/' -e 's/+ / /'
else
# Raw output: copy the sorted lines through unchanged.
exec cat
fi
# Normal completion: remove the temporary directory and the work file
# inside it.
rm -rf $TEMPDIR
|