Logo Search packages:      
Sourcecode: kat version File versions  Download package

katlanguagemanager.h

/***************************************************************************
 *   Copyright (C) 2005 by Roberto Cappuccio and the Kat team              *
 *   Roberto Cappuccio : roberto.cappuccio@gmail.com                       *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
 ***************************************************************************/

#ifndef _KATLANGUAGEMANAGER_H_
#define _KATLANGUAGEMANAGER_H_

#include <qstring.h>
#include <qmap.h>
#include <qfile.h>
#include <qstringlist.h>
#include <qptrlist.h>
#include "kat_export.h"

// The maximum score is the largest value an int can hold (INT_MAX).
// NOTE(review): INT_MAX is declared in <limits.h>/<climits>, which this header
// does not include directly; presumably it arrives through one of the Qt
// headers above -- confirm.
#define MAXSCORE INT_MAX

// Maximum number of n-grams kept in a fingerprint
#define MAXNGRAMS 400

// Maximum size of a single n-gram
#define MAXNGRAMSIZE 5

// Minimum size (in characters) a document must have to be accepted
#define MINDOCSIZE 25

// Maximum penalty for an n-gram that is missing from a fingerprint.
// NOTE(review): the value equals MAXNGRAMS + 1, but the two macros are not
// tied together -- keep them in sync by hand if that relation is intended.
#define MAXOUTOFPLACE 401

// Maximum size (in characters) of the document buffer
#define MAXDOCSIZE 5000

// Reported matches are those fingerprints whose score is less than
// best score * THRESHOLDVALUE (i.e. a THRESHOLDVALUE of 1.03 means a match
// must score within 3% of the best score)
#define THRESHOLDVALUE  1.03

// If more than MAXCANDIDATES matches are found, the classifier reports
// "unknown", because the input is too ambiguous to classify
#define MAXCANDIDATES 5

// A QMap from QString keys to long values, given its own type name.
// It adds no data or behaviour on top of QMap<QString, long>.
// NOTE(review): presumably the keys are n-grams and the values describe their
// position or weight within one language's profile -- confirm in the .cpp.
class LanguageProfile
    : public QMap<QString, long>
{
public:
    // The base map is default-constructed.
    LanguageProfile()
    {
    }

    ~LanguageProfile()
    {
    }
};

// Collection of LanguageProfile objects, each stored under a QString key.
// NOTE(review): the key is presumably the language name -- confirm in the .cpp.
typedef QMap<QString,LanguageProfile> LanguageProfileMap;

// Plain value holder pairing an n-gram string with a long counter.
// Both members are public and are set once by the constructor.
class NGram
{
public:
    // Stores n as the n-gram text and o as its occurrence count.
    NGram( const QString &n, long o )
        : ngram( n )
        , occurrences( o )
    {
    }

    ~NGram()
    {
    }

    QString ngram;     // the n-gram text
    long occurrences;  // count associated with the n-gram
};

// QPtrList of NGram pointers that supplies its own compareItems().
// The comparison is only declared here; its definition lives elsewhere.
// NOTE(review): presumably QPtrList consults compareItems() when sorting or
// searching, which makes this the place that fixes the n-gram ordering --
// confirm against the implementation.
class NGramsList
    : public QPtrList<NGram>
{
public:
    // The base list is default-constructed.
    NGramsList()
    {
    }

    ~NGramsList()
    {
    }

protected:
    // NOTE(review): QCollection looks like the older spelling of
    // QPtrCollection -- verify it is still available in the targeted Qt build.
    virtual int compareItems( QCollection::Item, QCollection::Item );
};

// Plain value holder pairing a language string with a long distance.
// Both members are public and are set once by the constructor.
class Language
{
public:
    // Stores l as the language and d as its distance.
    Language( const QString &l, long d )
        : language( l )
        , distance( d )
    {
    }

    ~Language()
    {
    }

    QString language;  // language identifier
    long distance;     // distance value associated with the language
};

// QPtrList of Language pointers that supplies its own compareItems().
// The comparison is only declared here; its definition lives elsewhere.
// NOTE(review): presumably QPtrList consults compareItems() when sorting or
// searching, which makes this the place that fixes the candidate ordering --
// confirm against the implementation.
class LanguageList
    : public QPtrList<Language>
{
public:
    // The base list is default-constructed.
    LanguageList()
    {
    }

    ~LanguageList()
    {
    }

protected:
    // NOTE(review): QCollection looks like the older spelling of
    // QPtrCollection -- verify it is still available in the targeted Qt build.
    virtual int compareItems( QCollection::Item, QCollection::Item );
};

// Exported class grouping the language-identification routines.
// Every operation declared here is static; only the constructor and the
// destructor are instance members. The definitions are not part of this
// header, so the notes below describe the declarations only.
class KAT_EXPORT KatLanguageManager
{
public:
    KatLanguageManager();
    ~KatLanguageManager();

    // Returns a pointer to a map of LanguageProfile objects keyed by QString.
    // NOTE(review): ownership of the returned map is not expressed in the
    // signature -- confirm whether the caller must delete it.
    static QMap<QString,LanguageProfile>* loadAllLanguageProfiles();

    // Takes a read-only QString, a QStringList passed by non-const reference
    // and an int.
    // NOTE(review): presumably the list receives the n-grams extracted from
    // the string and the int is the n-gram size -- confirm in the .cpp.
    static void extractNGrams( const QString&, QStringList&, int );

    // Returns an NGramsList by value for the given QString.
    // NOTE(review): presumably the argument is a file path -- confirm.
    static NGramsList createFingerprintFromFile( const QString& );

    // Returns an NGramsList by value for the given QString.
    // NOTE(review): presumably the argument is the text itself -- confirm.
    static NGramsList createFingerprintFromQString( const QString& );

    // Returns a QString for the given QString and profile map.
    // The profile map lp is taken by value, not by reference.
    static QString identifyLanguage( const QString&, QMap<QString,LanguageProfile> lp );

    // Returns a long computed from an NGramsList and a LanguageProfile.
    // Both arguments are non-const references.
    // NOTE(review): whether either one is modified cannot be told from here.
    static long calculateDistance( NGramsList&, LanguageProfile& );
};

#endif // _KATLANGUAGEMANAGER_H_

Generated by  Doxygen 1.6.0   Back to index