/// <summary>
/// Builds a serialized ChineseDictionary from a comma-separated list of
/// plain-text dictionary files and writes it to the given output path.
/// Command-line flags: -inputDicts file1,file2,...  -output path
/// (both optional; defaults below point at the Stanford NLP cluster layout).
/// </summary>
public static void Main(string[] args)
{
    string inputDicts = "/u/nlp/data/chinese-dictionaries/plain/ne_wikipedia-utf8.txt,/u/nlp/data/chinese-dictionaries/plain/newsexplorer_entities_utf8.txt,/u/nlp/data/chinese-dictionaries/plain/Ch-name-list-utf8.txt,/u/nlp/data/chinese-dictionaries/plain/wikilex-20070908-zh-en.txt,/u/nlp/data/chinese-dictionaries/plain/adso-1.25-050405-monolingual-clean.utf8.txt,/u/nlp/data/chinese-dictionaries/plain/lexicon_108k_normalized.txt,/u/nlp/data/chinese-dictionaries/plain/lexicon_mandarintools_normalized.txt,/u/nlp/data/chinese-dictionaries/plain/harbin-ChineseNames_utf8.txt,/u/nlp/data/chinese-dictionaries/plain/lexicon_HowNet_normalized.txt";
    string output = "/u/nlp/data/gale/segtool/stanford-seg/classifiers/dict-chris6.ser.gz";
    IDictionary<string, int> flagMap = Generics.NewHashMap();
    // Each flag consumes exactly one following argument.
    flagMap["-inputDicts"] = 1;
    flagMap["-output"] = 1;
    IDictionary<string, string[]> argsMap = StringUtils.ArgsToMap(args, flagMap);
    // args = argsMap.get(null);
    // FIX: use TryGetValue instead of Keys.Contains + indexer (double lookup).
    string[] vals;
    if (argsMap.TryGetValue("-inputDicts", out vals))
    {
        inputDicts = vals[0];
    }
    if (argsMap.TryGetValue("-output", out vals))
    {
        output = vals[0];
    }
    string[] dicts = inputDicts.Split(',');
    ChineseDocumentToSentenceProcessor cdtos = new ChineseDocumentToSentenceProcessor(null);
    bool expandMidDot = true;
    Edu.Stanford.Nlp.Wordseg.ChineseDictionary dict = new Edu.Stanford.Nlp.Wordseg.ChineseDictionary(dicts, cdtos, expandMidDot);
    dict.SerializeDictionary(output);
}
/// <summary>
/// Initializes this reader/writer from the given flags: stores the flags,
/// creates the line-iterator factory and sentence processor, and loads any
/// dictionaries named by the flags.
/// </summary>
public virtual void Init(SeqClassifierFlags flags)
{
    this.flags = flags;
    factory = LineIterator.GetFactory(new Sighan2005DocumentReaderAndWriter.CTBDocumentParser(this));
    // pichuan : flags.normalizationTable is null --> i believe this is replaced by some java class??
    // (Thu Apr 24 11:10:42 2008)
    cdtos = new ChineseDocumentToSentenceProcessor(flags.normalizationTable);
    if (flags.dictionary != null)
    {
        cdict = new ChineseDictionary(flags.dictionary.Split(','), cdtos, flags.expandMidDot);
    }
    // NOTE: a serialized dictionary, when configured, replaces any cdict built
    // from flags.dictionary above — order of these two checks matters.
    if (flags.serializedDictionary != null)
    {
        cdict = new ChineseDictionary(flags.serializedDictionary, cdtos, flags.expandMidDot);
    }
    if (flags.dictionary2 != null)
    {
        cdict2 = new ChineseDictionary(flags.dictionary2.Split(','), cdtos, flags.expandMidDot);
    }
}
/// <summary>
/// Loads a dictionary from the given files. Files ending in "ser.gz" are
/// deserialized and merged in directly; all others are parsed as plain text
/// via AddDict.
/// </summary>
/// <param name="dicts">Paths of dictionary files (plain text or .ser.gz).</param>
/// <param name="cdtos">Sentence processor used when normalizing plain-text entries.</param>
/// <param name="expandMidDot">Whether plain-text loading should expand mid-dot variants.</param>
public ChineseDictionary(string[] dicts, ChineseDocumentToSentenceProcessor cdtos, bool expandMidDot)
{
    // BUG FIX: the original (Java-converted) code passed Java format specifiers
    // (%d, %s, %n) to C#'s string.Format, which leaves them unsubstituted and
    // silently ignores the arguments. Use {0}-style composite format holes.
    logger.Info(string.Format("Loading Chinese dictionaries from {0} file{1}:{2}", dicts.Length, (dicts.Length == 1) ? string.Empty : "s", System.Environment.NewLine));
    foreach (string dict in dicts)
    {
        logger.Info("  " + dict);
    }
    // One word set per word length, indexed 0..MaxLexiconLength inclusive.
    for (int i = 0; i <= MaxLexiconLength; i++)
    {
        words_[i] = Generics.NewHashSet();
    }
    this.cdtos_ = cdtos;
    foreach (string dict in dicts)
    {
        if (dict.EndsWith("ser.gz"))
        {
            // TODO: the way this is written does not work if we allow
            // dictionaries to have different settings of MAX_LEXICON_LENGTH
            ICollection<string>[] dictwords = LoadDictionary(dict);
            for (int len = 0; len <= MaxLexiconLength; len++)
            {
                Sharpen.Collections.AddAll(words_[len], dictwords[len]);
                // Release each bucket eagerly; serialized dictionaries can be large.
                dictwords[len] = null;
            }
        }
        else
        {
            AddDict(dict, expandMidDot);
        }
    }
    // Report the total number of unique words across all length buckets.
    int total = 0;
    for (int i = 0; i <= MaxLexiconLength; i++)
    {
        total += words_[i].Count;
    }
    logger.Info(string.Format("Done. Unique words in ChineseDictionary is: {0}.{1}", total, System.Environment.NewLine));
}
/// <summary>
/// Convenience constructor: the first argument may name a single dictionary
/// file, or several files separated by commas; it is split and delegated to
/// the string-array constructor.
/// </summary>
public ChineseDictionary(string serDicts, ChineseDocumentToSentenceProcessor cdtos, bool expandMidDot)
    : this(serDicts.Split(','), cdtos, expandMidDot)
{
}
/// <summary>
/// Convenience constructor: loads the given dictionary files with mid-dot
/// expansion disabled (expandMidDot = false).
/// </summary>
public ChineseDictionary(string[] dicts, ChineseDocumentToSentenceProcessor cdtos) : this(dicts, cdtos, false) { }