/////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2009-2014 Alan Wright. All rights reserved.
// Distributable under the terms of either the Apache License (Version 2.0)
// or the GNU Lesser General Public License.
/////////////////////////////////////////////////////////////////////////////

#include "ContribInc.h"
#include "CzechAnalyzer.h"
#include "StandardTokenizer.h"
#include "StandardFilter.h"
#include "LowerCaseFilter.h"
#include "StopFilter.h"
#include "StringUtils.h"

namespace Lucene {

/// Default Czech stopwords, stored as raw UTF-8 bytes.
///
/// Layout: one stop word after another, each followed by a line feed (0x0a),
/// including the last one. Two-byte sequences starting with 0xc3 or 0xc5 are
/// the UTF-8 encodings of accented Czech letters (e.g. 0xc3 0xad, 0xc5 0xa1).
/// The buffer carries no terminating NUL byte. getDefaultStopSet() decodes it
/// and splits the result on "\n" to build the default stop set.
const uint8_t CzechAnalyzer::_CZECH_STOP_WORDS[] = {
    0x61, 0x0a, 0x73, 0x0a, 0x6b, 0x0a, 0x6f, 0x0a, 0x69, 0x0a, 0x75, 0x0a, 0x76, 0x0a, 0x7a, 0x0a,
    0x64, 0x6e, 0x65, 0x73, 0x0a, 0x63, 0x7a, 0x0a, 0x74, 0xc3, 0xad, 0x6d, 0x74, 0x6f, 0x0a, 0x62,
    0x75, 0x64, 0x65, 0xc5, 0xa1, 0x0a, 0x62, 0x75, 0x64, 0x65, 0x6d, 0x0a, 0x62, 0x79, 0x6c, 0x69,
    0x0a, 0x6a, 0x73, 0x65, 0xc5, 0xa1, 0x0a, 0x6d, 0x75, 0x6a, 0x0a, 0x73, 0x76, 0xc3, 0xbd, 0x6d,
    0x0a, 0x74, 0x61, 0x0a, 0x74, 0x6f, 0x6d, 0x74, 0x6f, 0x0a, 0x74, 0x6f, 0x68, 0x6c, 0x65, 0x0a,
    0x74, 0x75, 0x74, 0x6f, 0x0a, 0x74, 0x79, 0x74, 0x6f, 0x0a, 0x6a, 0x65, 0x6a, 0x0a, 0x7a, 0x64,
    0x61, 0x0a, 0x70, 0x72, 0x6f, 0x63, 0x0a, 0x6d, 0xc3, 0xa1, 0x74, 0x65, 0x0a, 0x74, 0x61, 0x74,
    0x6f, 0x0a, 0x6b, 0x61, 0x6d, 0x0a, 0x74, 0x6f, 0x68, 0x6f, 0x74, 0x6f, 0x0a, 0x6b, 0x64, 0x6f,
    0x0a, 0x6b, 0x74, 0x65, 0x72, 0xc3, 0xad, 0x0a, 0x6d, 0x69, 0x0a, 0x6e, 0xc3, 0xa1, 0x6d, 0x0a,
    0x74, 0x6f, 0x6d, 0x0a, 0x74, 0x6f, 0x6d, 0x75, 0x74, 0x6f, 0x0a, 0x6d, 0xc3, 0xad, 0x74, 0x0a,
    0x6e, 0x69, 0x63, 0x0a, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x0a, 0x6b, 0x74, 0x65, 0x72, 0x6f, 0x75,
    0x0a, 0x62, 0x79, 0x6c, 0x61, 0x0a, 0x74, 0x6f, 0x68, 0x6f, 0x0a, 0x70, 0x72, 0x6f, 0x74, 0x6f,
    0xc5, 0xbe, 0x65, 0x0a, 0x61, 0x73, 0x69, 0x0a, 0x68, 0x6f, 0x0a, 0x6e, 0x61, 0xc5, 0xa1, 0x69,
    0x0a, 0x6e, 0x61, 0x70, 0x69, 0xc5, 0xa1, 0x74, 0x65, 0x0a, 0x72, 0x65, 0x0a, 0x63, 0x6f, 0xc5,
    0xbe, 0x0a, 0x74, 0xc3, 0xad, 0x6d, 0x0a, 0x74, 0x61, 0x6b, 0xc5, 0xbe, 0x65, 0x0a, 0x73, 0x76,
    0xc3, 0xbd, 0x63, 0x68, 0x0a, 0x6a, 0x65, 0x6a, 0xc3, 0xad, 0x0a, 0x73, 0x76, 0xc3, 0xbd, 0x6d,
    0x69, 0x0a, 0x6a, 0x73, 0x74, 0x65, 0x0a, 0x61, 0x6a, 0x0a, 0x74, 0x75, 0x0a, 0x74, 0x65, 0x64,
    0x79, 0x0a, 0x74, 0x65, 0x74, 0x6f, 0x0a, 0x62, 0x79, 0x6c, 0x6f, 0x0a, 0x6b, 0x64, 0x65, 0x0a,
    0x6b, 0x65, 0x0a, 0x70, 0x72, 0x61, 0x76, 0xc3, 0xa9, 0x0a, 0x6a, 0x69, 0x0a, 0x6e, 0x61, 0x64,
    0x0a, 0x6e, 0x65, 0x6a, 0x73, 0x6f, 0x75, 0x0a, 0x63, 0x69, 0x0a, 0x70, 0x6f, 0x64, 0x0a, 0x74,
    0xc3, 0xa9, 0x6d, 0x61, 0x0a, 0x6d, 0x65, 0x7a, 0x69, 0x0a, 0x70, 0x72, 0x65, 0x73, 0x0a, 0x74,
    0x79, 0x0a, 0x70, 0x61, 0x6b, 0x0a, 0x76, 0xc3, 0xa1, 0x6d, 0x0a, 0x61, 0x6e, 0x69, 0x0a, 0x6b,
    0x64, 0x79, 0xc5, 0xbe, 0x0a, 0x76, 0xc5, 0xa1, 0x61, 0x6b, 0x0a, 0x6e, 0x65, 0x67, 0x0a, 0x6a,
    0x73, 0x65, 0x6d, 0x0a, 0x74, 0x65, 0x6e, 0x74, 0x6f, 0x0a, 0x63, 0x6c, 0xc3, 0xa1, 0x6e, 0x6b,
    0x75, 0x0a, 0x63, 0x6c, 0xc3, 0xa1, 0x6e, 0x6b, 0x79, 0x0a, 0x61, 0x62, 0x79, 0x0a, 0x6a, 0x73,
    0x6d, 0x65, 0x0a, 0x70, 0x72, 0x65, 0x64, 0x0a, 0x70, 0x74, 0x61, 0x0a, 0x6a, 0x65, 0x6a, 0x69,
    0x63, 0x68, 0x0a, 0x62, 0x79, 0x6c, 0x0a, 0x6a, 0x65, 0xc5, 0xa1, 0x74, 0x65, 0x0a, 0x61, 0xc5,
    0xbe, 0x0a, 0x62, 0x65, 0x7a, 0x0a, 0x74, 0x61, 0x6b, 0xc3, 0xa9, 0x0a, 0x70, 0x6f, 0x75, 0x7a,
    0x65, 0x0a, 0x70, 0x72, 0x76, 0x6e, 0xc3, 0xad, 0x0a, 0x76, 0x61, 0xc5, 0xa1, 0x65, 0x0a, 0x6b,
    0x74, 0x65, 0x72, 0xc3, 0xa1, 0x0a, 0x6e, 0xc3, 0xa1, 0x73, 0x0a, 0x6e, 0x6f, 0x76, 0xc3, 0xbd,
    0x0a, 0x74, 0x69, 0x70, 0x79, 0x0a, 0x70, 0x6f, 0x6b, 0x75, 0x64, 0x0a, 0x6d, 0x75, 0xc5, 0xbe,
    0x65, 0x0a, 0x73, 0x74, 0x72, 0x61, 0x6e, 0x61, 0x0a, 0x6a, 0x65, 0x68, 0x6f, 0x0a, 0x73, 0x76,
    0xc3, 0xa9, 0x0a, 0x6a, 0x69, 0x6e, 0xc3, 0xa9, 0x0a, 0x7a, 0x70, 0x72, 0xc3, 0xa1, 0x76, 0x79,
    0x0a, 0x6e, 0x6f, 0x76, 0xc3, 0xa9, 0x0a, 0x6e, 0x65, 0x6e, 0xc3, 0xad, 0x0a, 0x76, 0xc3, 0xa1,
    0x73, 0x0a, 0x6a, 0x65, 0x6e, 0x0a, 0x70, 0x6f, 0x64, 0x6c, 0x65, 0x0a, 0x7a, 0x64, 0x65, 0x0a,
    0x75, 0xc5, 0xbe, 0x0a, 0x62, 0xc3, 0xbd, 0x74, 0x0a, 0x76, 0xc3, 0xad, 0x63, 0x65, 0x0a, 0x62,
    0x75, 0x64, 0x65, 0x0a, 0x6a, 0x69, 0xc5, 0xbe, 0x0a, 0x6e, 0x65, 0xc5, 0xbe, 0x0a, 0x6b, 0x74,
    0x65, 0x72, 0xc3, 0xbd, 0x0a, 0x62, 0x79, 0x0a, 0x6b, 0x74, 0x65, 0x72, 0xc3, 0xa9, 0x0a, 0x63,
    0x6f, 0x0a, 0x6e, 0x65, 0x62, 0x6f, 0x0a, 0x74, 0x65, 0x6e, 0x0a, 0x74, 0x61, 0x6b, 0x0a, 0x6d,
    0xc3, 0xa1, 0x0a, 0x70, 0x72, 0x69, 0x0a, 0x6f, 0x64, 0x0a, 0x70, 0x6f, 0x0a, 0x6a, 0x73, 0x6f,
    0x75, 0x0a, 0x6a, 0x61, 0x6b, 0x0a, 0x64, 0x61, 0x6c, 0xc5, 0xa1, 0xc3, 0xad, 0x0a, 0x61, 0x6c,
    0x65, 0x0a, 0x73, 0x69, 0x0a, 0x73, 0x65, 0x0a, 0x76, 0x65, 0x0a, 0x74, 0x6f, 0x0a, 0x6a, 0x61,
    0x6b, 0x6f, 0x0a, 0x7a, 0x61, 0x0a, 0x7a, 0x70, 0x65, 0x74, 0x0a, 0x7a, 0x65, 0x0a, 0x64, 0x6f,
    0x0a, 0x70, 0x72, 0x6f, 0x0a, 0x6a, 0x65, 0x0a, 0x6e, 0x61, 0x0a, 0x61, 0x74, 0x64, 0x0a, 0x61,
    0x74, 0x70, 0x0a, 0x6a, 0x61, 0x6b, 0x6d, 0x69, 0x6c, 0x65, 0x0a, 0x70, 0x72, 0x69, 0x63, 0x65,
    0x6d, 0xc5, 0xbe, 0x0a, 0x6a, 0xc3, 0xa1, 0x0a, 0x6f, 0x6e, 0x0a, 0x6f, 0x6e, 0x61, 0x0a, 0x6f,
    0x6e, 0x6f, 0x0a, 0x6f, 0x6e, 0x69, 0x0a, 0x6f, 0x6e, 0x79, 0x0a, 0x6d, 0x79, 0x0a, 0x76, 0x79,
    0x0a, 0x6a, 0xc3, 0xad, 0x0a, 0x6a, 0x69, 0x0a, 0x6d, 0x65, 0x0a, 0x6d, 0x6e, 0x65, 0x0a, 0x6a,
    0x65, 0x6d, 0x75, 0x0a, 0x74, 0x6f, 0x6d, 0x75, 0x0a, 0x74, 0x65, 0x6d, 0x0a, 0x74, 0x65, 0x6d,
    0x75, 0x0a, 0x6e, 0x65, 0x6d, 0x75, 0x0a, 0x6e, 0x65, 0x6d, 0x75, 0xc5, 0xbe, 0x0a, 0x6a, 0x65,
    0x68, 0x6f, 0xc5, 0xbe, 0x0a, 0x6a, 0xc3, 0xad, 0xc5, 0xbe, 0x0a, 0x6a, 0x65, 0x6c, 0x69, 0x6b,
    0x6f, 0xc5, 0xbe, 0x0a, 0x6a, 0x65, 0xc5, 0xbe, 0x0a, 0x6a, 0x61, 0x6b, 0x6f, 0xc5, 0xbe, 0x0a,
    0x6e, 0x61, 0x63, 0x65, 0xc5, 0xbe, 0x0a
};

/// Creates an analyzer that filters with the default Czech stop set.
/// @param matchVersion Version value later forwarded to the tokenizer and
/// to the stop filter's position-increment default lookup.
CzechAnalyzer::CzechAnalyzer(LuceneVersion::Version matchVersion) {
    // The parameter shadows the member, hence the explicit this-> here.
    this->matchVersion = matchVersion;
    stoptable = getDefaultStopSet();
}

/// Creates an analyzer that filters with a caller-supplied stop set.
/// @param matchVersion Version value later forwarded to the tokenizer and
/// to the stop filter's position-increment default lookup.
/// @param stopwords Set of words handed to the StopFilter instead of the
/// built-in Czech defaults.
CzechAnalyzer::CzechAnalyzer(LuceneVersion::Version matchVersion, HashSet<String> stopwords) {
    // The parameter shadows the member, hence the explicit this-> here.
    this->matchVersion = matchVersion;
    stoptable = stopwords;
}

/// Destructor; performs no explicit cleanup of its own.
CzechAnalyzer::~CzechAnalyzer() {
}

/// Returns the default Czech stop word set.
///
/// The set lives in a function-local static. It is populated by decoding
/// _CZECH_STOP_WORDS from UTF-8 and splitting the text on "\n"; that work is
/// wrapped in LUCENE_RUN_ONCE (presumably so it executes only on the first
/// call - confirm against the macro definition).
/// @return The shared default stop set.
const HashSet<String> CzechAnalyzer::getDefaultStopSet() {
    static HashSet<String> stopSet;
    LUCENE_RUN_ONCE(
        Collection<String> entries(StringUtils::split(UTF8_TO_STRING(_CZECH_STOP_WORDS), L"\n"));
        stopSet = HashSet<String>::newInstance(entries.begin(), entries.end());
    );
    return stopSet;
}

/// Builds a fresh analysis chain over the given reader.
///
/// Chain: StandardTokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter.
/// The filter order is kept identical to reusableTokenStream() so that both
/// entry points of this analyzer produce the same token stream for the same
/// input (previously LowerCaseFilter was applied before StandardFilter here
/// only).
/// @param fieldName Name of the field being analyzed (not used by this analyzer).
/// @param reader Source of the text to tokenize.
/// @return The outermost filter of the newly built chain.
TokenStreamPtr CzechAnalyzer::tokenStream(const String& fieldName, const ReaderPtr& reader) {
    TokenStreamPtr result = newLucene<StandardTokenizer>(matchVersion, reader);
    result = newLucene<StandardFilter>(result);
    result = newLucene<LowerCaseFilter>(result);
    // Stop words are removed last, after normalization and lower-casing.
    result = newLucene<StopFilter>(StopFilter::getEnablePositionIncrementsVersionDefault(matchVersion), result, stoptable);
    return result;
}

/// Returns an analysis chain over the given reader, reusing the chain saved
/// by a previous call when one is available.
///
/// When getPreviousTokenStream() yields a CzechAnalyzerSavedStreams, only its
/// tokenizer is reset onto the new reader. Otherwise a new chain
/// (StandardTokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter) is
/// built and stored via setPreviousTokenStream().
/// @param fieldName Name of the field being analyzed (not used by this analyzer).
/// @param reader Source of the text to tokenize.
/// @return The outermost filter of the saved chain.
TokenStreamPtr CzechAnalyzer::reusableTokenStream(const String& fieldName, const ReaderPtr& reader) {
    CzechAnalyzerSavedStreamsPtr saved = boost::dynamic_pointer_cast<CzechAnalyzerSavedStreams>(getPreviousTokenStream());

    // Reuse path: point the existing tokenizer at the new reader.
    if (saved) {
        saved->source->reset(reader);
        return saved->result;
    }

    // First use: build the chain and remember it for later calls.
    saved = newLucene<CzechAnalyzerSavedStreams>();
    saved->source = newLucene<StandardTokenizer>(matchVersion, reader);
    saved->result = newLucene<StandardFilter>(saved->source);
    saved->result = newLucene<LowerCaseFilter>(saved->result);
    saved->result = newLucene<StopFilter>(StopFilter::getEnablePositionIncrementsVersionDefault(matchVersion), saved->result, stoptable);
    setPreviousTokenStream(saved);
    return saved->result;
}

/// Destructor; performs no explicit cleanup of its own.
CzechAnalyzerSavedStreams::~CzechAnalyzerSavedStreams() {
}

}
