// Smoke test for corpus.AddDocument: runs CorpusTest() first, adds "3.txt"
// with the single-argument overload, then adds "4.txt" together with two
// hand-built term documents. Each task is waited on synchronously and its
// result is written to the console; nothing is asserted.
public void addDocumentTest()
{
    CorpusTest();

    // First add: file name only.
    var pending = corpus.AddDocument("3.txt");
    pending.Wait();
    Console.WriteLine(pending.Result);

    // Term document "stop" with one reference.
    var stopTerm = new TermDocument("stop");
    var stopSet = new HashSet<int>(new int[] { 1 });
    var stopReference = new DocumentReference("", "ped", stopSet);
    stopTerm.addDoc(stopReference);

    // Term document "Learn" with one reference.
    var learnTerm = new TermDocument("Learn");
    var learnSet = new HashSet<int>(new int[] { 2 });
    var learnReference = new DocumentReference("", "ing", learnSet);
    learnTerm.addDoc(learnReference);

    // Second add: file name, the value 2, and the explicit term documents.
    var extraTerms = new List<ITermDocument>(new TermDocument[] { stopTerm, learnTerm });
    pending = corpus.AddDocument("4.txt", 2, extraTerms);
    pending.Wait();
    Console.WriteLine(pending.Result);
}
/// <summary>
/// Builds a <see cref="Corpus"/> from the column of <paramref name="input"/> named by
/// <c>_inputColumnName</c>. The corpus vocabulary is taken from the column's
/// "KeyValues" annotation, and one document is added per row from that row's
/// <c>VBuffer&lt;uint&gt;</c> values.
/// </summary>
/// <param name="input">Data view containing the input column.</param>
/// <returns>The populated corpus.</returns>
private Corpus ExtractCorpus(IDataView input)
{
    var result = new Corpus();
    var inputColumn = input.Schema[_inputColumnName];

    // The vocabulary comes from the column's "KeyValues" annotation.
    var keyValues = new VBuffer<ReadOnlyMemory<char>>();
    inputColumn.Annotations.GetValue("KeyValues", ref keyValues);
    result.Vocabulary = keyValues.GetValues().ToArray();

    VBuffer<uint> tokensVector = default;
    using (var cursor = input.GetRowCursor(new[] { inputColumn }))
    {
        // The getter delegate is bound to the cursor, not to a particular row,
        // so it is created once here instead of on every iteration.
        var tokensGetter = cursor.GetGetter<VBuffer<uint>>(inputColumn);

        while (cursor.MoveNext())
        {
            tokensGetter(ref tokensVector);

            // GetValues() exposes the buffer's stored values; copy them because
            // the buffer is reused for the next row.
            result.AddDocument(tokensVector.GetValues().ToArray());
        }
    }

    return result;
}
/// <summary>
/// Dispatches a single console command. The command is split on whitespace and
/// the first token, compared case-insensitively, selects the action:
/// <list type="bullet">
/// <item><description><c>""</c>: does nothing.</description></item>
/// <item><description><c>exit</c>: sets <c>_exitFlag</c> (no extra tokens allowed).</description></item>
/// <item><description><c>clear</c>: clears the console (no extra tokens allowed).</description></item>
/// <item><description><c>genas</c>: loads "WEMAS.xml" into an annotation dictionary and saves it as "WEMAS.wemas".</description></item>
/// <item><description><c>testas</c>: loads "WEMAS.wemas", sets its description and saves it as "WEMAS2.wemas".</description></item>
/// <item><description><c>testws</c>: reads "utf8.txt", runs both annotators over it, saves "RESULT.xml" and prints every word.</description></item>
/// </list>
/// Anything else prints an "Unknown Command" message.
/// </summary>
/// <param name="command">The raw command line entered by the user.</param>
static void ProcessCommand(string command)
{
    string[] blocks = command.Split();

    // Use the invariant culture so command keywords match regardless of the
    // current UI culture (e.g. Turkish maps 'I' to a dotless 'ı').
    switch (blocks[0].ToLowerInvariant())
    {
        case "":
            return;

        case "exit":
            if (blocks.Length == 1)
            {
                _exitFlag = true;
            }
            else
            {
                Console.WriteLine("Unknown Parameters.");
            }
            return;

        case "clear":
            if (blocks.Length == 1)
            {
                Console.Clear();
            }
            else
            {
                Console.WriteLine("Unknown Parameters.");
            }
            return;

        case "genas":
        {
            AnnotationDictionary dict = new AnnotationDictionary();
            AnnotationSet ans = new AnnotationSet("WEMAS");
            dict.Load("WEMAS.xml");

            // Copy every description and sentence separator into the set.
            foreach (var entry in dict._Dictionary)
            {
                ans.SetAnnotationDescription(entry.Key, entry.Value);
            }
            foreach (var sep in dict.SentenceSeparators)
            {
                ans.AddSentenceSeparator(sep);
            }

            ans.Save("WEMAS.wemas");
            return;
        }

        case "testas":
        {
            AnnotationSet ans = new AnnotationSet("WEMAS");
            ans.Load("WEMAS.wemas");
            ans.Description = "WEB ENTITY MINER Default Annotation Set.";
            ans.Save("WEMAS2.wemas");
            return;
        }

        case "testws":
        {
            AnnotationSet wemas = new AnnotationSet();
            wemas.Load("WEMAS.wemas");

            ICTCLASAnnotator ano = new ICTCLASAnnotator(wemas, null);
            CRFPPAnnotator crfAno = new CRFPPAnnotator(
                wemas,
                new Model(AppDomain.CurrentDomain.BaseDirectory + "model.crfppmodel"));

            Corpus c = new Corpus(Encoding.UTF8);

            SentenceFactory.AnnotationSet = wemas;
            SentenceFactory.InputLanguage = Language.SimplifiedChinese;
            SentenceFactory.OutputEncoding = Encoding.UTF8;

            WEMDocument doc = new WEMDocument(Encoding.UTF8);

            // Dispose the input stream once the sentences have been consumed.
            // The stream stays open across the loop in case GetSentences reads
            // lazily during enumeration.
            using (var input = new FileStream("utf8.txt", FileMode.Open, FileAccess.Read))
            {
                var sens = SentenceFactory.GetSentences(input);
                foreach (var sen in sens)
                {
                    doc.AddSentence(sen);
                }
            }

            c.AddDocument(doc);

            // Best effort: an annotator failure is reported, and the document
            // is still saved and printed below.
            try
            {
                ano.ProcessCorpus(c);
                crfAno.ProcessCorpus(c);
            }
            catch (Exception ex)
            {
                Console.WriteLine("Unhandled Exception:\n{0}", ex.Message);
            }

            doc.Save("RESULT.xml");

            foreach (var sentence in doc.Sentences)
            {
                foreach (var word in sentence.Words)
                {
                    // Entities are printed with the annotation looked up by
                    // their entity id; plain words are printed as-is.
                    var entity = word as Entity;
                    if (entity != null)
                    {
                        Console.WriteLine("{0}/ENTITY:{1}", word.Content, wemas[entity.EntityId]);
                    }
                    else
                    {
                        Console.WriteLine("{0}", word.Content);
                    }
                }
            }
            return;
        }

        default:
            Console.WriteLine("Unknown Command: '{0}'.", blocks[0]);
            return;
    }
}