#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
  #include "translate.h" // alphabet translation tables
#include "cleanl.h"    // header file for clean language class
#include "gestalt.h"
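  // Word database: a flat sequence of NUL-terminated byte strings stored in the
  // translated alphabet (see Translate), not in plain ASCII.  Entries come in
  // pairs: a pattern followed by the replacement text CheckWord() substitutes
  // for it.  A pattern whose first byte is the translated '*' is matched as a
  // substring (wildcard); every other pattern is fuzzy-compared.
  // NOTE: the table ends right after the last string's NUL; there is no empty
  // terminating string, so any scan must be bounded by sizeof(data).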
static const unsigned char data[] =
{
224,105,247,13,163,30,149,0,90,178,30,149,30,150,37,0,224,105,
247,13,163,54,195,180,0,103,71,205,30,190,37,163,54,195,180,0,
224,105,247,13,163,0,190,37,163,30,8,103,71,205,30,0,224,150,
178,54,90,0,150,90,247,105,105,0,224,184,247,150,150,222,0,163,
54,90,90,30,195,0,224,84,247,103,103,150,178,54,90,0,69,71,
71,8,69,71,71,0,224,190,71,90,178,30,149,105,247,13,163,30,
149,0,105,37,190,54,103,222,8,180,247,222,0,224,37,150,150,178,
71,103,30,0,84,71,43,71,0,224,190,71,90,178,30,149,105,247,
13,163,54,195,180,0,105,247,195,8,150,30,30,163,54,195,180,0,
224,84,54,90,13,178,0,105,30,190,37,103,30,8,69,71,180,0,
224,13,71,13,163,150,247,13,163,30,149,0,184,103,30,37,150,247,
149,30,8,180,54,205,30,149,0,224,105,247,13,163,30,69,0,184,
30,195,30,90,149,37,90,30,69,0,224,105,247,13,163,30,69,0,
54,195,8,149,30,37,103,103,222,8,69,30,30,184,8,90,149,71,
247,84,103,30,0,224,37,150,150,178,71,103,30,0,84,54,180,8,
69,71,195,163,30,222,0,224,84,54,90,13,178,0,195,54,13,30,
8,180,247,222,0,224,105,247,163,0,150,178,71,205,30,0,224,105,
247,13,0,150,178,71,205,30,0,224,195,54,180,180,30,149,0,180,
149,30,37,90,8,180,247,222,0,224,184,247,150,150,222,0,195,54,
13,30,8,180,247,222,0,224,150,178,54,90,0,13,149,37,184,0,
224,150,178,222,90,0,13,149,37,184,0,224,195,30,180,30,149,0,
195,54,13,30,8,180,247,222,0,184,149,54,13,163,0,150,178,37,
149,184,8,184,37,54,195,0,178,30,103,103,0,178,30,13,163,0,
69,37,190,195,0,69,37,149,195,0,13,247,195,90,0,163,54,90,
90,222,0,184,54,150,150,0,128,71,151,244,0,150,178,54,90,178,
30,37,69,0,184,54,195,178,30,37,69,0,180,71,69,69,37,190,
195,30,69,0,180,71,150,178,8,69,37,149,195,0,84,54,90,13,
178,54,195,180,0,200,178,54,195,54,195,180,0,84,37,150,90,37,
149,69,0,105,37,90,178,30,149,103,30,150,150,0,37,150,150,0,
69,71,195,163,30,222,0,13,103,54,90,0,90,71,222,0,150,247,
13,163,150,0,54,150,8,103,37,190,30,0,13,71,13,163,0,149,
71,71,150,90,30,149,0,69,54,13,163,0,134,54,13,178,37,149,
69,0,69,54,13,163,178,30,37,69,0,134,54,13,178,37,149,69,
155,150,8,195,71,180,180,54,195,0,13,247,190,0,103,54,22,247,
54,69,0,84,103,71,200,239,71,84,0,54,195,105,103,37,90,54,
71,195,37,149,222,8,30,190,184,103,71,222,190,30,195,90,0,90,
54,90,0,84,71,71,84,54,30,0,90,54,90,150,0,184,54,103,
103,71,200,150,0,84,37,103,103,150,0,84,149,37,205,30,149,222,
0,205,37,180,54,195,37,0,103,37,149,180,30,8,71,184,30,195,
54,195,180,0,184,30,195,54,150,0,69,71,103,184,178,54,195,0,
71,149,180,37,190,150,0,71,149,54,180,37,190,54,0,84,149,30,
37,150,90,150,0,13,178,54,13,163,30,195,150,0,84,149,30,37,
150,90,0,150,71,105,90,84,37,103,103,0,30,13,150,90,37,13,
222,0,178,37,184,184,222,8,69,37,222,150,0,195,54,184,184,103,
30,0,184,37,13,54,105,54,30,149,0,150,247,13,163,103,30,69,
0,69,149,37,195,163,0,190,37,195,178,71,71,69,0,90,178,30,
8,84,54,180,8,71,195,30,0,84,54,71,90,13,178,0,195,54,
13,30,8,180,247,222,0,84,54,37,90,13,178,0,195,54,13,30,
8,180,247,222,0,84,247,195,180,0,195,71,150,30,0,13,178,54,
180,37,84,71,71,0,195,54,13,30,8,180,247,222,0,13,178,54,
195,13,0,195,54,13,30,8,180,247,222,0,13,178,54,195,163,0,
195,54,13,30,8,180,247,222,0,13,103,54,90,0,84,54,180,8,
90,71,30,0,13,71,13,163,0,84,54,180,8,90,71,30,0,13,
247,190,0,150,184,37,190,0,150,247,13,163,0,84,149,30,37,90,
178,30,8,90,178,149,71,247,180,178,8,37,8,150,90,149,37,200,
0,13,247,195,195,54,103,54,195,180,247,150,0,195,71,150,30,8,
149,247,84,84,54,195,180,0,13,247,195,90,0,195,71,150,90,149,
54,103,0,69,54,13,163,0,84,54,180,8,90,71,30,0,69,54,
103,69,71,0,90,71,37,150,90,30,149,0,69,222,163,30,0,195,
54,13,30,8,180,247,222,0,105,37,180,0,195,54,13,30,8,180,
247,222,0,105,30,103,103,37,90,54,71,0,163,54,150,150,54,195,
180,0,180,71,69,69,0,180,71,103,103,222,0,239,37,13,163,71,
0,195,54,13,30,8,180,247,222,0,239,30,149,163,71,0,195,54,
13,30,8,180,247,222,0,239,30,149,163,0,195,54,13,30,8,180,
247,222,0,239,54,180,37,84,71,71,0,195,54,13,30,8,180,247,
222,0,239,54,150,190,0,150,184,37,190,0,239,54,43,0,150,184,
37,190,0,163,54,163,30,0,195,54,13,30,8,180,247,222,0,163,
103,54,90,0,195,71,150,90,149,54,103,0,163,247,195,90,0,195,
54,13,30,8,180,247,222,0,163,222,163,30,0,195,54,13,30,8,
180,247,222,0,103,30,150,84,71,0,195,54,13,30,8,180,247,222,
0,103,30,43,84,71,0,195,54,13,30,8,180,247,222,0,195,54,
180,180,0,195,54,13,30,8,180,247,222,0,184,30,195,54,150,0,
84,54,180,8,90,71,30,0,184,54,150,150,0,184,247,163,30,0,
184,178,247,0,150,178,71,205,30,0,184,149,54,13,163,0,195,54,
13,30,8,180,247,222,0,150,195,37,90,13,178,0,195,71,150,90,
149,54,103,0,150,184,30,149,190,0,239,30,103,103,71,0,150,184,
54,13,0,195,54,13,30,8,180,247,222,0,90,54,90,0,150,184,
37,149,149,71,200,0,90,200,37,90,0,195,71,150,90,149,54,103,
0,205,37,180,54,195,0,195,71,150,90,149,54,103,0,200,71,184,
0,195,54,13,30,8,180,247,222,0,37,149,150,13,178,0,69,71,
195,163,30,222,0,84,103,37,150,30,195,0,180,30,149,190,37,195,
8,200,71,149,69,0,84,247,190,150,30,195,0,180,30,149,190,37,
195,8,200,71,149,69,0,105,54,13,163,0,150,178,71,205,30,0,
105,54,13,163,30,195,0,180,30,149,190,37,195,8,200,71,149,69,
0,105,54,13,163,30,149,0,180,30,149,190,37,195,8,200,71,149,
69,0,105,247,90,43,30,0,180,30,149,190,37,195,8,200,71,149,
69,0,180,71,90,90,205,30,149,69,37,190,190,90,0,180,71,103,
103,222,0,178,54,90,103,30,149,0,150,190,247,149,105,0,178,247,
149,30,0,195,54,13,30,8,180,247,222,0,178,247,149,30,150,71,
178,195,0,195,54,13,30,8,180,247,222,0,54,90,37,163,30,149,
0,180,30,149,190,37,195,8,200,71,149,69,0,163,37,105,105,30,
149,0,180,30,149,190,37,195,8,200,71,149,69,0,163,103,54,90,
71,149,54,150,0,195,71,150,90,149,54,103,0,103,30,150,84,30,
0,195,54,13,30,8,180,247,222,0,195,37,43,54,0,195,54,13,
30,8,180,247,222,0,195,247,90,90,30,0,180,30,149,190,37,195,
8,200,71,149,69,0,184,54,30,190,30,103,0,180,30,149,190,37,
195,8,200,71,149,69,0,184,54,190,190,30,103,0,180,30,149,190,
37,195,8,200,71,149,69,0,184,54,150,150,30,0,184,247,163,30,
0,184,54,150,150,30,195,0,184,247,163,54,195,180,0,184,54,150,
150,30,149,0,195,54,13,30,8,180,247,222,0,184,71,103,37,163,
30,0,195,54,13,30,8,180,247,222,0,150,13,178,103,37,190,184,
30,0,180,30,149,190,37,195,8,200,71,149,69,0,150,13,178,200,
37,195,43,0,84,54,180,8,90,71,30,0,150,13,178,200,30,54,
195,0,195,54,13,30,8,180,247,222,0,150,13,178,200,247,13,178,
90,30,103,0,180,30,149,190,37,195,8,200,71,149,69,0,150,13,
178,200,30,103,0,180,30,149,190,37,195,8,200,71,149,69,0,150,
13,178,200,247,103,54,195,0,180,30,149,190,37,195,8,200,71,149,
69,0,150,184,30,149,190,37,0,150,184,37,190,0,90,54,90,90,
30,0,150,184,37,149,149,71,200,0,200,54,13,178,150,30,0,180,
30,149,190,37,195,8,200,71,149,69,0,
};
  
// Builds the alphabet translation table and unpacks the static word database
// into mWords, one String per NUL-terminated entry, preserving table order.
CleanLanguage::CleanLanguage(void)
{
  mTranslate = new Translate(13); // build ascii translation table

  // Place contents of database into an STL vector of strings.
  // The table has no empty terminating string after its last entry, so the
  // scan is bounded by the array size instead of relying on reading a zero
  // byte past the end of the array.
  const unsigned char *cursor    = data;
  const unsigned char *const end = data + sizeof(data);
  while ( cursor < end && *cursor )
  {
    // Never build a string from an entry whose terminator is missing.
    if ( !memchr(cursor,0,(size_t)(end-cursor)) ) break;

    String str = (const char *)cursor;
    mWords.push_back(str);
    cursor += str.size()+1; // advance to next string.
  }

  // get translated wildcard character.
  mWildChar = mTranslate->TranslateIn( (unsigned char) '*' );
}
  // Releases the translation table owned by this object.
CleanLanguage::~CleanLanguage(void)
{
  delete mTranslate; // allocated with 'new Translate(13)' in the constructor
}
  // Reports whether 'c' is an ASCII letter ('a'-'z' or 'A'-'Z').
// Every other character, including the terminating zero, yields false.
bool CleanLanguage::IsAlpha(char c)
{
  const bool lower = ( c >= 'a' && c <= 'z' );
  const bool upper = ( c >= 'A' && c <= 'Z' );
  return lower || upper;
}
  // Appends 'c' to the zero-terminated buffer 'clean' at position 'index'.
// 'maxlen' is the total capacity of 'clean'; one slot is always kept for the
// terminator.  On success 'index' is advanced and true is returned; when the
// buffer is full nothing is written and false is returned.
bool CleanLanguage::Add(char c,char *clean,int &index,int maxlen)
{
  if ( index >= maxlen-1 ) return false; // no room left for char + terminator

  clean[index++] = c;
  clean[index]   = 0; // keep the buffer terminated after every append
  return true;
}
  // Analyzes 'input' for foul language and writes a cleaned copy to 'output':
// non-alphabetic characters are copied through unchanged, and every run of
// letters is passed to CheckWord() and replaced by whatever it returns.
// Returns the number of offending words found.
//
// 'output' is always assigned (it becomes empty for empty input).
// NOTE: the working buffer holds 8x the input length; Add() silently drops
// characters beyond that, so an unusually long replacement for a very short
// word can be truncated.
int CleanLanguage::WashYourMouth(const String &input,String &output) const
{
  int l = input.size();
  if ( !l )
  {
    output = input; // nothing to clean; don't leave stale text in 'output'.
    return 0;
  }

  int dirtycount = 0;

  int maxlen  = l*8;                // capacity of the cleaned-up output buffer.
  char *clean = new char[maxlen];
  char *word  = new char[l+1];      // a word can never be longer than the input.

  *clean = 0;
  int i = 0;                        // write position inside 'clean'.

  const char *dirty = input.c_str();

  while ( *dirty )
  {
    // add non-alphabetic characters verbatim.
    while ( *dirty && !IsAlpha(*dirty) )
    {
      Add(*dirty,clean,i,maxlen);
      dirty++;
    }

    // grab sequence of alphabetic characters (IsAlpha is false for the
    // terminating zero, so this stops at the end of the input).
    char *wordat = word;
    while ( IsAlpha(*dirty) )
    {
      *wordat++ = *dirty++;
    }
    *wordat = 0;

    // Input ended on non-letters: there is no word, so skip the database scan.
    if ( wordat == word ) continue;

    // analyze the word and emit its (possibly replaced) text.
    String wordin = word;
    String wordout;
    if ( CheckWord(wordin,wordout) ) dirtycount++;

    for ( const char *foo = wordout.c_str(); *foo; ++foo )
    {
      Add(*foo,clean,i,maxlen);
    }
  }

  output = clean;

  // Both buffers came from new[], so they must be released with delete[].
  delete[] word;
  delete[] clean;

  return dirtycount;
}
  
// See if the word is 'bad' and return a cleaned up version in 'good'.
//
// The word is lower-cased, converted to the database alphabet and compared
// against every pattern in mWords (stored as pattern/replacement pairs).
// Wildcard patterns (first byte == mWildChar) match as substrings; all other
// patterns are fuzzy-compared and accepted according to a length-dependent
// threshold.  On a match 'good' receives the replacement, capitalized when
// the original word started with an upper-case letter; otherwise 'good' is a
// copy of 'bad'.
//
// Returns the best match percentage (100 for a wildcard hit or for a word of
// 255 or more characters, which is replaced by "too long"), or 0 when the
// word is acceptable.
int CleanLanguage::CheckWord(const String &bad,String &good) const
{
  int size = bad.size();

  if ( size >= 255 ) // we don't process words more than 255 characters long
  {
    good = "too long";
    return 100; // replaced!
  }

  int percent = 0;         // best matching percentage.
  const char *best = NULL; // replacement belonging to the best match so far.
  char temp[256];          // copy potential bad string and convert to new alphabet
  char match[256];         // translated result

  strcpy(temp,bad.c_str()); // fits: size < 255 was checked above.

  bool uppercase = ( *temp >= 'A' && *temp <= 'Z' );

  // Lower-case ASCII letters in place (strlwr() is not a standard function).
  for ( char *p = temp; *p; ++p )
  {
    if ( *p >= 'A' && *p <= 'Z' ) *p += 'a' - 'A';
  }

  mTranslate->TranslateIn((unsigned char *)temp); // change alphabet

  StringVector::const_iterator i = mWords.begin();

  while ( i != mWords.end() )
  {
    const unsigned char *word = (const unsigned char *)(*i).c_str();
    ++i;
    if ( i == mWords.end() ) break; // pattern without a replacement: ignore it.
    const char *replacement = (*i).c_str();
    ++i;

    if ( *word == mWildChar ) // process as a wildcard!
    {
      if ( !percent && WildCard((const char *)&word[1],temp) )
      {
        percent = 100; // perfect match to wildcard!!
        best    = replacement;
      }
    }
    else
    {
      // Perform fuzzy compare between the two strings.
      int alike = Gestalt::FuzzyCompare(temp,(const char *)word);

      // Heuristic based on length of word and percentage match:
      // the shorter the word, the closer the match has to be.
      bool close = ( size <= 4  && alike > 99 ) ||
                   ( size >= 5  && size < 7  && alike > 85 ) ||
                   ( size >= 7  && size < 10 && alike > 80 ) ||
                   ( size >= 10 && alike > 75 );

      if ( close && alike > percent ) // if best match so far.
      {
        percent = alike;
        best    = replacement;
      }
    }
  }

  if ( percent )
  {
    // Bounded copy so an over-long database entry cannot overflow 'match'.
    strncpy(match,best,sizeof(match)-1);
    match[sizeof(match)-1] = 0;

    mTranslate->TranslateOut((unsigned char *)match);

    // Mirror the capitalization of the original word, but only when the
    // replacement really starts with a lower-case letter.
    if ( uppercase && match[0] >= 'a' && match[0] <= 'z' ) match[0] -= 'a' - 'A';

    good = match;
  }
  else
  {
    good = bad; // nothing wrong with it.
  }

  return percent;
}
  // Reports whether the character sequence 'wild' occurs anywhere inside
// 'match' (plain substring search).  An empty 'wild' is found in every string.
bool CleanLanguage::WildCard(const char *wild,const char *match)
{
  return strstr(match,wild) != NULL;
}
   |