/// <summary>
/// Tokenizes every line of each HTML file in the dataset folder with the
/// OpenNLP maxent tokenizer and writes the tokens to TokensReader.txt,
/// starting a new output line after each "." token.
/// </summary>
static void Main(string[] args)
{
    String[] files = Directory.GetFiles(@"C:\10311173\lab-5-opennlp-tokenization-Changyiyu\Dataset", "*.html");

    // Load the tokenizer model once up front: the original re-read en-token.bin
    // for every input line, which is very slow and leaked one file handle per line.
    InputStream modelIn = new FileInputStream(@"C:\10311173\lab-6-opennlp-ju-zi-qie-fen-Changyiyu\en-token.bin");
    TokenizerModel model = new TokenizerModel(modelIn);
    modelIn.close();
    TokenizerME enTokenizer = new TokenizerME(model);

    // using ensures the output file is flushed and closed even on exceptions.
    using (StreamWriter sw = new StreamWriter(@"TokensReader.txt"))
    {
        foreach (string thefile in files)
        {
            using (StreamReader sr = new StreamReader(thefile))
            {
                while (sr.Peek() != -1)
                {
                    string line = sr.ReadLine();
                    string[] tokens = enTokenizer.tokenize(line);
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        sw.Write(tokens[i] + " ");
                        if (tokens[i].Equals("."))
                        {
                            sw.Write("\r\n"); // new output line after a period token
                        }
                    }
                }
            }
        }
    }
}
/// <summary>
/// Trains a maxent token model and runs the shared tokenizer checks against it.
/// </summary>
public void TestTokenizer()
{
    var tokenModel = TokenizerTestUtil.CreateMaxentTokenModel();
    TestTokenizer(new TokenizerME(tokenModel));
}
/// <summary>
/// Splits <paramref name="Message"/> into sentences, reverses the token order
/// inside each sentence, and returns the detokenized result.
/// </summary>
/// <param name="Message">Text to process.</param>
/// <returns>Builder holding every sentence with its tokens reversed.</returns>
private StringBuilder ReverseIt(string Message)
{
    var result = new StringBuilder();

    // Sentence and token models are preloaded by LoadNLP.
    var sentenceParser = new SentenceDetectorME(LoadNLP.sentenceModel);
    var tokenizer = new TokenizerME(LoadNLP.tokenModel);

    foreach (string sentence in sentenceParser.sentDetect(Message))
    {
        string[] tokens = tokenizer.tokenize(sentence);
        // In-place reversal of the token array.
        Array.Reverse(tokens);
        // Detokenize so the reversed tokens read as a normal string again.
        result.Append(DeTokenize(tokens, DetokenizationDictionary.Operation.MOVE_LEFT));
    }

    return result;
}
/// <summary>
/// Loads the tokenizer and POS tagger, then returns this lexer so calls can be chained.
/// </summary>
/// <returns>This instance, fully initialized.</returns>
public ILexer InitNow() { Console.WriteLine("Loading..."); _tokenizer = prepareTokenizer(); _posTagger = preparePOSTagger(); return this; }
/// <summary>
/// Tokenizes every line of lab5result.txt in the dataset folder and writes the
/// tokens to lab6result.txt, starting a new output line after each "." token.
/// </summary>
static void Main(string[] args)
{
    String[] files = Directory.GetFiles(@"..\..\..\..\Dataset", "lab5result.txt");

    // Load the tokenizer model once up front: the original re-read en-token.bin
    // for every input line, which is very slow and leaked one file handle per line.
    InputStream modelIn = new FileInputStream(@"..\..\..\..\en-token.bin");
    TokenizerModel model = new TokenizerModel(modelIn);
    modelIn.close();
    TokenizerME enTokenizer = new TokenizerME(model);

    // false = overwrite any previous result; using ensures the file is closed.
    using (StreamWriter sw = new StreamWriter(@"..\..\..\..\lab6result.txt", false))
    {
        foreach (string file in files)
        {
            using (StreamReader sr = new StreamReader(file))
            {
                while (sr.Peek() != -1)
                {
                    string line = sr.ReadLine();
                    string[] tokens = enTokenizer.tokenize(line);
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        sw.Write(tokens[i] + " ");
                        if (tokens[i].Equals("."))
                        {
                            sw.Write("\r\n"); // new output line after a period token
                        }
                    }
                }
            }
        }
    }
}
/// <summary>
/// Tokenizes every line of each HTML file in the dataset folder and writes the
/// tokens to ReadByTokens.txt, starting a new output line after each "." token.
/// </summary>
static void Main(string[] args)
{
    String[] file = Directory.GetFiles(@"C:\10311171\lab-6-opennlp-ju-zi-qie-fen-HUNGLIWEN\lab6\Dataset", "*.html");

    // Load the tokenizer model once up front: the original re-read en-token.bin
    // for every input line, which is very slow and leaked one file handle per line.
    InputStream modelIn = new FileInputStream(@"C:\10311171\lab-6-opennlp-ju-zi-qie-fen-HUNGLIWEN\en-token.bin");
    TokenizerModel model = new TokenizerModel(modelIn);
    modelIn.close();
    TokenizerME enTokenizer = new TokenizerME(model);

    using (StreamWriter sw = new StreamWriter(@"ReadByTokens.txt"))
    {
        foreach (string filename in file)
        {
            using (StreamReader sr = new StreamReader(filename))
            {
                while (sr.Peek() != -1)
                {
                    string line = sr.ReadLine();
                    string[] tokens = enTokenizer.tokenize(line);
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        sw.Write(tokens[i] + " ");
                        if (tokens[i].Equals("."))
                        {
                            sw.Write("\n"); // new output line after a period token
                        }
                    }
                }
            }
        }
    }
}
/// <summary>
/// Split the input content into individual word tokens.
/// </summary>
/// <param name="contents">Content to split into words</param>
/// <returns>The tokens produced by the maxent tokenizer.</returns>
public static IEnumerable<string> TokenizeNow(string contents)
{
    //ToDo: Make preprocessing function a functional pointer
    var cleaned = PreProcessing(contents);
    return new TokenizerME(Model).tokenize(cleaned);
}
/// <summary>
/// Loads the tokenizer plus the name, location and time finders, then returns
/// this lexer so calls can be chained.
/// </summary>
/// <returns>This instance, fully initialized.</returns>
public ILexer InitNow() { Console.WriteLine("Loading..."); _tokenizer = prepareTokenizer(); _nameFinder = prepareNameFinder(); _locationFinder = prepareLocationFinder(); _timeFinder = prepareTimeFinder(); return this; }
/// <summary>
/// Tokenizes a review with the OpenNLP maxent tokenizer, stripping periods first.
/// </summary>
/// <param name="review">Raw review text.</param>
/// <returns>The tokens of the review (with "." characters removed).</returns>
string[] Tokenizer(string review)
{
    InputStream modelIn = new FileInputStream(modelPath + "en-token.zip");
    try
    {
        TokenizerModel model = new TokenizerModel(modelIn);
        TokenizerME tokenizer = new TokenizerME(model);
        // Periods are removed up front so they never appear as tokens.
        return tokenizer.tokenize(review.Replace(".", ""));
    }
    finally
    {
        // The original leaked this stream on every call.
        modelIn.close();
    }
}
/// <summary>
/// Split the input content into individual word tokens.
/// </summary>
/// <param name="contents">Content to split into words</param>
/// <returns>The tokens produced by the maxent tokenizer.</returns>
public static IEnumerable <string> TokenizeNow(string contents)
{
    //ToDo: Make preprocessing function a functional pointer
    var preprocessed = PreProcessing(contents);
    var maxentTokenizer = new TokenizerME(Model);
    return maxentTokenizer.tokenize(preprocessed);
}
/// <summary>
/// Verifies that the trained maxent model splits "test," into the word and the
/// trailing comma.
/// </summary>
public void TestTokenizerSimpleModel()
{
    var tokenizer = new TokenizerME(TokenizerTestUtil.CreateMaxentTokenModel());

    var tokens = tokenizer.Tokenize("test,");

    Assert.AreEqual(2, tokens.Length);
    Assert.AreEqual("test", tokens[0]);
    Assert.AreEqual(",", tokens[1]);
}
/// <summary>
/// Trains a maxent tokenizer model from the bundled token.train corpus
/// (100 iterations, no cutoff).
/// </summary>
/// <returns>The freshly trained model.</returns>
public static TokenizerModel CreateMaxentTokenModel()
{
    using (var data = Tests.OpenFile("/opennlp/tools/tokenize/token.train"))
    {
        var trainingParams = new TrainingParameters();
        trainingParams.Set(Parameters.Iterations, "100");
        trainingParams.Set(Parameters.Cutoff, "0");

        var sampleStream = new TokenSampleStream(new PlainTextByLineStream(data));
        return TokenizerME.Train(sampleStream, new TokenizerFactory("en", null, true), trainingParams);
    }
}
/// <summary>
/// Lazily loads the OpenNLP tokenizer model the first time it is needed;
/// subsequent calls are no-ops.
/// </summary>
private void LoadTokenizer()
{
    if (!alreadyLoadTagger)
    {
        java.io.FileInputStream modelInpStream = new java.io.FileInputStream("Resources\\en-token.bin");
        TokenizerModel tokenizerModel = new TokenizerModel(modelInpStream);
        // Close the stream once the model is in memory (the original leaked it).
        modelInpStream.close();
        tokenizer = new TokenizerME(tokenizerModel);
        alreadyLoadTagger = true;
    }
}
/// <summary>
/// Builds a MUC name-sample stream: loads the tokenizer model named on the
/// command line and feeds every accepted file under the data directory through it.
/// </summary>
/// <param name="args">Command-line arguments parsed into <c>Parameters</c>.</param>
/// <returns>The name-sample stream over the MUC documents.</returns>
public override ObjectStream <NameSample> create(string[] args)
{
    Parameters parsedArgs = ArgumentParser.parse(args, typeof(Parameters));

    Tokenizer tokenizer = new TokenizerME((new TokenizerModelLoader()).load(parsedArgs.TokenizerModel));

    var directoryStream = new DirectorySampleStream(parsedArgs.Data, new FileFilterAnonymousInnerClassHelper(this), false);
    ObjectStream <string> mucDocStream = new FileToStringSampleStream(directoryStream, Charset.forName("UTF-8"));

    return new MucNameSampleStream(tokenizer, mucDocStream);
}
/// <summary>
/// Loads the English token model and tokenizes the sentenced USPE text file.
/// </summary>
public void Tokenize()
{
    var modelStream = new java.io.FileInputStream("../../Models/en-token.bin");
    var model = new TokenizerModel(modelStream);
    // Release the file handle once the model has been read (the original leaked it).
    modelStream.close();
    var tokenizer = new TokenizerME(model);
    var txt = File.ReadAllText(@"c:\dev\d-mill\uspe\Data\uspe-sentenced.txt");
    var tokens = tokenizer.tokenize(txt);
}
/// <summary>
/// Trains a tokenizer model (100 iterations, no cutoff) from the plain-text
/// training file at <paramref name="path"/>.
/// </summary>
/// <param name="path">Path of the token training data.</param>
/// <returns>The trained tokenizer model.</returns>
public static TokenizerModel TrainModel(string path)
{
    // Dispose the file stream deterministically (the original never closed it).
    using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        TokenSampleStream stream = new TokenSampleStream(new PlainTextByLineStream(fs));
        TrainingParameters trainParams = new TrainingParameters();
        trainParams.Set(Parameters.Iterations, "100");
        trainParams.Set(Parameters.Cutoff, "0");
        return TokenizerME.Train(stream, new TokenizerFactory(TRAINING_LANGUAGE, null, true), trainParams);
    }
}
/// <summary>
/// Returns the character spans of every token in <paramref name="paragraph"/>.
/// </summary>
/// <param name="paragraph">Text to tokenize.</param>
/// <returns>One span per token, in document order.</returns>
public Span[] GetTokens(string paragraph)
{
    var bin = GetFileStream("en-token.bin");
    var tokenizer = new TokenizerME(new TokenizerModel(bin));
    Span[] spans = tokenizer.tokenizePos(paragraph);
    bin.close();
    return spans;
}
/// <summary>
/// Trains an English tokenizer model from the sentenced training file and
/// smoke-tests it on a short sample sentence.
/// </summary>
public void TrainTokenizer()
{
    var utf8 = Charset.forName("UTF-8");
    var lineStream = new opennlp.tools.util.PlainTextByLineStream(
        new java.io.FileInputStream(@"c:\dev\d-mill\uspe\Data\uspe-sentenced-train.txt"), utf8);
    var sampleStream = new opennlp.tools.namefind.NameSampleDataStream(lineStream);

    var trainedModel = TokenizerME.train("en", sampleStream, true);

    var trainedTokenizer = new TokenizerME(trainedModel);
    var tokens = trainedTokenizer.tokenize("Hi. How are you? This is Mike.");
}
/// <summary>
/// Strips HTML markup from each dataset file, tokenizes the remaining text,
/// and writes the tokens to Html.txt with a new line after each "." token.
/// </summary>
static void Main(string[] args)
{
    String[] file = Directory.GetFiles(@"..\..\..\..\Dataset\", "*.html");

    // Compile the HTML-stripping patterns once instead of once per input line.
    Regex openTags = new Regex(@"(<P>|<BR />|<DIV>)");
    Regex closeTags = new Regex(@"(</P>|<p/>|</DIV>)");
    string regFind = @"<a.*?>" + @"(?'text'.*?)" + @"</a>";
    string regReplace = @"${text}";
    string regFindimg = @"<img.*?title=""" + @"(?'text2'.*?)" + @""" />";
    string regReplaceimg = @"${text2}";

    // Load the tokenizer model once up front: the original re-read en-token.bin
    // for every input line, which is very slow and leaked one file handle per line.
    InputStream modelIn = new FileInputStream(@"..\..\..\..\en-token.bin");
    TokenizerModel model = new TokenizerModel(modelIn);
    modelIn.close();
    TokenizerME enTokenizer = new TokenizerME(model);

    using (StreamWriter sw = new StreamWriter(@"..\..\..\Html.txt"))
    {
        foreach (String files in file)
        {
            using (StreamReader sr = new StreamReader(files))
            {
                while (sr.Peek() != -1)
                {
                    string line = sr.ReadLine();
                    // Drop opening/closing block tags, keep anchor text and image titles.
                    line = openTags.Replace(line, "");
                    line = closeTags.Replace(line, "");
                    // NOTE(review): the pattern below is a single space (possibly a
                    // mangled &nbsp; entity) and strips all spaces — confirm intent.
                    line = Regex.Replace(line, @" ", "");
                    line = Regex.Replace(line, regFind, regReplace);
                    String[] tokens = enTokenizer.tokenize(Regex.Replace(line, regFindimg, regReplaceimg));
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        sw.Write(tokens[i] + " ");
                        if (tokens[i].Equals("."))
                        {
                            sw.Write("\n"); // new output line after a period token
                        }
                    }
                }
            }
        }
    }
}
/// <summary>
/// Trains a token model with the C# port, serializes it to a temp file, reloads
/// it with the original Java implementation, and verifies that both tokenizers
/// produce (almost) identical token probabilities.
/// </summary>
public void TestCrossCompatibility()
{
    using (var data = Tests.OpenFile("/opennlp/tools/tokenize/token.train"))
    {
        var samples = new TokenSampleStream(new PlainTextByLineStream(data));
        var mlParams = new TrainingParameters();
        mlParams.Set(Parameters.Iterations, "100");
        mlParams.Set(Parameters.Cutoff, "0");
        var model = TokenizerME.Train(samples, new TokenizerFactory("en", null, true), mlParams);

        var sMe = new TokenizerME(model);
        TokenizerMETest.TestTokenizer(sMe);
        var sProbs = sMe.TokenProbabilities;

        // --- java \/

        var sFile = Path.GetTempFileName();
        // Dispose the stream so the serialized model is fully flushed and the
        // file is unlocked before the Java side reads it back (the original
        // leaked the FileStream, risking a truncated or locked file).
        using (var fs = new FileStream(sFile, FileMode.Create))
        {
            model.Serialize(fs);
        }

        var jModel = new opennlp.tools.tokenize.TokenizerModel(
            OpenNLP.CreateInputStream(sFile)
        );
        var jMe = new opennlp.tools.tokenize.TokenizerME(jModel);
        TestJavaTokenizer(jMe);
        var jProbs = jMe.getTokenProbabilities();

        Assert.AreEqual(jProbs.Length, sProbs.Length);
        for (int i = 0; i < jProbs.Length; i++)
        {
            // one difference :(
            // -0.00000000000000011102230246251565
            //
            // but still "insignificant" :)
            Assert.AreEqual(jProbs[i], sProbs[i], 0.0000000001d);
        }
    }
}
/// <summary>
/// Trains a small maxent token model from a handful of hand-built samples
/// covering words with and without trailing commas.
/// </summary>
/// <returns>The trained model.</returns>
public static TokenizerModel CreateSimpleMaxentTokenModel()
{
    var samples = new List <TokenSample>
    {
        new TokenSample("year", new[] { new Span(0, 4) }),
        new TokenSample("year,", new[] { new Span(0, 4), new Span(4, 5) }),
        new TokenSample("it,", new[] { new Span(0, 2), new Span(2, 3) }),
        new TokenSample("it", new[] { new Span(0, 2) }),
        new TokenSample("yes", new[] { new Span(0, 3) }),
        new TokenSample("yes,", new[] { new Span(0, 3), new Span(3, 4) })
    };

    var trainingParams = new TrainingParameters();
    trainingParams.Set(Parameters.Iterations, "100");
    trainingParams.Set(Parameters.Cutoff, "0");

    var sampleStream = new CollectionObjectStream <TokenSample>(samples);
    return TokenizerME.Train(sampleStream, new TokenizerFactory("en", null, true), trainingParams);
}
/// <summary>
/// Loads every OpenNLP model used by the pipeline (sentence detector, tokenizer,
/// POS tagger, chunker, parser) plus the stop-word list.
/// </summary>
public NLP()
{
    //loading sentence detector model
    java.io.FileInputStream modelInpStream = new java.io.FileInputStream("Resources\\en-sent.bin");
    SentenceModel sentenceModel = new SentenceModel(modelInpStream);
    sentenceDetector = new SentenceDetectorME(sentenceModel);
    modelInpStream.close(); // close each stream once its model is loaded (the original leaked them all)

    //loading tokenizer model
    modelInpStream = new java.io.FileInputStream("Resources\\en-token.bin");
    TokenizerModel tokenizerModel = new TokenizerModel(modelInpStream);
    tokenizer = new TokenizerME(tokenizerModel);
    modelInpStream.close();

    //loading POS tagger model
    modelInpStream = new java.io.FileInputStream("Resources\\en-pos-maxent.bin");
    POSModel posModel = new POSModel(modelInpStream);
    tagger = new POSTaggerME(posModel);
    modelInpStream.close();

    //loading chunker model
    modelInpStream = new java.io.FileInputStream("Resources\\en-chunker.bin");
    ChunkerModel chunkerModel = new ChunkerModel(modelInpStream);
    chunker = new ChunkerME(chunkerModel);
    modelInpStream.close();

    //loading parser model
    modelInpStream = new java.io.FileInputStream("Resources\\en-parser-chunking.bin");
    ParserModel parserModel = new ParserModel(modelInpStream);
    parser = ParserFactory.create(parserModel);
    modelInpStream.close();

    //loading stop words list (using disposes the reader; the original leaked it)
    using (StreamReader sr = new StreamReader("Resources\\english.stop.txt"))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            // keep both the stemmed and the raw form of each stop word
            stopwords.Add(Stemming(line));
            stopwords.Add(line);
        }
    }
}
/// <summary>
/// Strips HTML markup from each dataset file, tokenizes the remaining text,
/// and writes the tokens to ReadByTokens.txt with a new line after each "." token.
/// </summary>
static void Main(string[] args)
{
    String[] file = Directory.GetFiles(@"C:\10311209\lab-6-opennlp-ju-zi-qie-fen-XiuXuanLiu\Dataset", "*.html");

    // Load the tokenizer model once up front: the original re-read en-token.bin
    // for every input line, which is very slow and leaked one file handle per line.
    InputStream modelIn = new FileInputStream(@"C:\10311209\lab-6-opennlp-ju-zi-qie-fen-XiuXuanLiu\en-token.bin");
    TokenizerModel model = new TokenizerModel(modelIn);
    modelIn.close();
    TokenizerME enTokenizer = new TokenizerME(model);

    using (StreamWriter sw = new StreamWriter(@"ReadByTokens.txt"))
    {
        foreach (string filename in file)
        {
            using (StreamReader sr = new StreamReader(filename))
            {
                while (sr.Peek() != -1)
                {
                    string line = sr.ReadLine();
                    // Strip HTML markup before tokenizing.
                    line = Regex.Replace(line, "<P[^>]*>", ""); // [^>] = any character that is not '>'
                    // NOTE(review): pattern below is a single space (possibly a
                    // mangled &nbsp; entity) and strips all spaces — confirm intent.
                    line = Regex.Replace(line, @" ", "");
                    line = Regex.Replace(line, "<DIV[^>]*>", "");
                    line = Regex.Replace(line, "<BR[^>]*>", "");
                    // Keep an image's title text and a link's anchor text.
                    line = Regex.Replace(line, "<img[^>]*title=\"(?'titleName'.*?)\"[^>]*>", "${titleName}");
                    line = Regex.Replace(line, "<[^>]*href.*>(?'Name'.*?)<[^>]*>", "${Name}");
                    line = Regex.Replace(line, "<[^>]*>", "");
                    string[] tokens = enTokenizer.tokenize(line);
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        sw.Write(tokens[i] + " ");
                        if (tokens[i].Equals("."))
                        {
                            sw.Write("\n"); // new output line after a period token
                        }
                    }
                }
            }
        }
    }
}
/// <summary>
/// Splits each file in the input folder into sentences, tokenizes them, and
/// writes one tokenized line per input line to the answer folder.
/// </summary>
static void Main(string[] args)
{
    // Load both models once; close the streams after the models are built
    // (the original never closed them).
    java.io.InputStream modelIn = new java.io.FileInputStream("en-sent.bin");
    java.io.InputStream modelIn2 = new java.io.FileInputStream("en-token.bin");
    TokenizerModel model = new TokenizerModel(modelIn2);
    TokenizerME mE = new TokenizerME(model);
    SentenceModel sM = new SentenceModel(modelIn);
    SentenceDetector detector = new SentenceDetectorME(sM);
    modelIn.close();
    modelIn2.close();

    string folderName = @"C:\Users\Administrator\Desktop\lab-6-opennlp-ju-zi-qie-fen-10411174\file";
    foreach (string fname in System.IO.Directory.GetFiles(folderName))
    {
        // Path.GetFileName is robust where the original's fname.Split('\\')[6]
        // silently broke whenever the folder depth changed.
        string outPath = @"C:\Users\Administrator\Desktop\lab-6-opennlp-ju-zi-qie-fen-10411174\answer\" + System.IO.Path.GetFileName(fname);
        using (StreamWriter sw = new StreamWriter(outPath))
        using (StreamReader file2 = new StreamReader(fname))
        {
            String line;
            while ((line = file2.ReadLine()) != null)
            {
                string[] sents = detector.sentDetect(line);
                if (sents.Length.Equals(0))
                {
                    continue; // nothing detected on this line
                }
                // Rejoin the detected sentences before tokenizing, as the original did.
                string str = string.Concat(sents);
                foreach (var s in mE.tokenize(str))
                {
                    sw.Write(s + " ");
                }
                sw.WriteLine();
            }
        }
    }
}
/// <summary>
/// Builds a MUC coref sample stream: tokenizes and fully parses each document,
/// enhances it with one name finder per entity tag, and inserts mentions.
/// </summary>
/// <param name="args">Command-line arguments parsed into <c>Parameters</c>.</param>
/// <returns>The coref sample stream over the MUC documents.</returns>
public override ObjectStream <CorefSample> create(string[] args)
{
    Parameters parsedArgs = ArgumentParser.parse(args, typeof(Parameters));

    ParserModel parserModel = (new ParserModelLoader()).load(parsedArgs.ParserModel);
    Parser parser = ParserFactory.create(parserModel);

    TokenizerModel tokenizerModel = (new TokenizerModelLoader()).load(parsedArgs.TokenizerModel);
    Tokenizer tokenizer = new TokenizerME(tokenizerModel);

    ObjectStream <string> mucDocStream = new FileToStringSampleStream(
        new DirectorySampleStream(parsedArgs.Data, new FileFilterAnonymousInnerClassHelper(this), false),
        Charset.forName("UTF-8"));

    ObjectStream <RawCorefSample> rawSamples = new MucCorefSampleStream(tokenizer, mucDocStream);
    ObjectStream <RawCorefSample> parsedSamples = new FullParseCorefEnhancerStream(parser, rawSamples);

    // How to load all these nameFinder models ?!
    // Lets make a param per model, not that nice, but ok!
    IDictionary <string, Jfile> modelFileTagMap = new Dictionary <string, Jfile>();
    modelFileTagMap["person"] = parsedArgs.PersonModel;
    modelFileTagMap["organization"] = parsedArgs.OrganizationModel;

    IList <TokenNameFinder> nameFinders = new List <TokenNameFinder>();
    IList <string> tags = new List <string>();
    foreach (KeyValuePair <string, Jfile> entry in modelFileTagMap)
    {
        nameFinders.Add(new NameFinderME((new TokenNameFinderModelLoader()).load(entry.Value)));
        tags.Add(entry.Key);
    }

    var enhanced = new NameFinderCorefEnhancerStream(nameFinders.ToArray(), tags.ToArray(), parsedSamples);
    return new MucMentionInserterStream(enhanced);
}
/// <summary>
/// Parses a natural-language query: extracts dates (via a name finder) and
/// noun/verb phrases (via the chunking parser), matches them against the
/// vocabulary in file2.xml, strips matched words and stop words from the line,
/// then re-parses and re-classifies the remainder a second time.
/// </summary>
/// <param name="line">The user query to analyse.</param>
/// <returns>Map from matched text to its category labels
/// (e.g. START_PERIOD / END_PERIOD).</returns>
public Dictionary <string, List <string> > Main(string line) {
    // Reset all result sets from any previous invocation.
    matchedWords?.Clear();
    nounPhrases?.Clear();
    nouns?.Clear();
    adjectivePhrases?.Clear();
    verbPhrases?.Clear();

    // Model streams; modelIn1/modelIn2 are closed in the finally block below.
    // NOTE(review): modelIn (parser model) is never closed — confirm and fix.
    InputStream modelIn = new FileInputStream(HttpRuntime.AppDomainAppPath + "\\Models\\en-parser-chunking.bin");
    InputStream modelIn1 = new FileInputStream(HttpRuntime.AppDomainAppPath + "\\Models\\en-ner-date.bin");
    InputStream modelIn2 = new FileInputStream(HttpRuntime.AppDomainAppPath + "\\Models\\en-token.bin");

    // First parse: GetSentenceParts fills the phrase/noun collections.
    ParserModel model = new ParserModel(modelIn);
    var myParser = ParserFactory.create(model);
    var topParses = ParserTool.parseLine(line, myParser, 1);
    foreach (var p in topParses) {
        GetSentenceParts(p);
    }
    try {
        // Tokenize the line and run the date name finder over the tokens.
        TokenizerModel model1 = new TokenizerModel(modelIn2);
        TokenNameFinderModel model2 = new TokenNameFinderModel(modelIn1);
        Tokenizer tokenizer = new TokenizerME(model1);
        var nameFinder = new NameFinderME(model2);
        var tokens = tokenizer.tokenize(line);
        var nameSpans = nameFinder.find(tokens);
        var array = Span.spansToStrings(nameSpans, tokens);
        dates = new HashSet <string>(array);
        PrintSets();

        //TODO IF NO OPTION IS FOUND ASK THE USER TO GIVE YOU ONE. IMPLEMENT IT IN THE WEB VERSION SOON
        System.Diagnostics.Debug.WriteLine("\nProcessing Dates");
        // Two date spans: the first (minus any leading "from") is the start
        // period, the second is the end period.
        if (dates.Count == 2) {
            if (dates.ElementAt(0).contains("from")) {
                var a = dates.ElementAt(0).replace("from", "");
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                matchedWords.Add(a, newList);
                newList = new List <string>();
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dates.ElementAt(1), newList);
            } else {
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                matchedWords.Add(dates.ElementAt(0), newList);
                newList = new List <string>();
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dates.ElementAt(1), newList);
            }
        }
        // One date span: either a "from X to Y" range (split on " to "), or a
        // single date acting as both start and end of the period.
        if (dates.Count == 1) {
            if (dates.ElementAt(0).contains("from")) {
                var a = dates.ElementAt(0).replace("from", "");
                var dts = a.Split(new[] { " to " }, StringSplitOptions.None);
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                matchedWords.Add(dts[0], newList);
                newList = new List <string>();
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dts[1], newList);
            } else {
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dates.ElementAt(0), newList);
            }
        }

        System.Diagnostics.Debug.WriteLine("\nProcessing noun phrases");
        // Match the extracted phrases against the vocabulary in file2.xml.
        var doc = new XmlDocument();
        doc.Load(HttpRuntime.AppDomainAppPath + "\\file2.xml");
        var root = doc.SelectSingleNode("*");
        FindMatchingNodesFromXml(root, nounPhrases);
        // Drop nouns that are already covered by a matched phrase.
        foreach (var item in nouns.ToList()) {
            foreach (var VARIABLE in matchedWords) {
                if (VARIABLE.Key.Contains(item)) {
                    nouns.Remove(item); //Will work!
                }
            }
        }
        FindMatchingNodesFromXml(root, verbPhrases);
        System.Diagnostics.Debug.WriteLine("\nProcessing verb phrases ");
        System.Diagnostics.Debug.WriteLine("\nProcessing nouns ");
        // construct the dictionary object and open it
        var directory = Directory.GetCurrentDirectory() + "\\wordnet\\";
        foreach (var variable in matchedWords) {
            System.Diagnostics.Debug.WriteLine(variable.Value + "\t\t" + variable.Key);
        }
        // Remove everything already matched, plus stop words and punctuation,
        // so the second parse only sees the still-unclassified words.
        foreach (var variable in matchedWords) {
            string a = variable.Key;
            if (line.Contains(a)) {
                line = line.replace(a, "");
            }
        }
        foreach (var variable in stopWordsofwordnet) {
            string a = " " + variable.toLowerCase() + " ";
            if (line.Contains(a)) {
                line = line.replace(a, " ");
            }
        }
        if (line.contains(".")) {
            line = line.replace(".", "");
        }
        if (line.contains("-")) {
            line = line.replace("-", " ");
        }
        System.Diagnostics.Debug.WriteLine("/////////////");
        System.Diagnostics.Debug.WriteLine("SECOND PARSE STRING " + line);
        System.Diagnostics.Debug.WriteLine("/////////////");
        line = line.Trim();

        // Second parse over the reduced line.
        topParses = ParserTool.parseLine(line, myParser, 1);
        nounPhrases?.Clear();
        dates?.Clear();
        verbPhrases?.Clear();
        nouns?.Clear();
        foreach (var p in topParses) {
            GetSentenceParts(p);
        }
        FindMatchingNodesFromXml(root, nounPhrases);
        foreach (var item in nouns.ToList()) {
            foreach (var VARIABLE in matchedWords) {
                if (VARIABLE.Key.Contains(item)) {
                    nouns.Remove(item); //Will work!
                }
            }
        }
        FindMatchingNodesFromXml(root, verbPhrases);
        FindMatchingNodesFromXml(root, nouns);

        // Re-run date detection on the reduced line and classify again.
        tokens = tokenizer.tokenize(line);
        nameSpans = nameFinder.find(tokens);
        array = Span.spansToStrings(nameSpans, tokens);
        dates = new HashSet <string>(array);
        PrintSets();
        System.Diagnostics.Debug.WriteLine("\nProcessing Dates");
        // Same two-date / one-date classification as for the first pass.
        if (dates.Count == 2) {
            if (dates.ElementAt(0).contains("from")) {
                var a = dates.ElementAt(0).replace("from", "");
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                matchedWords.Add(a, newList);
                newList = new List <string>();
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dates.ElementAt(1), newList);
            } else {
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                matchedWords.Add(dates.ElementAt(0), newList);
                newList = new List <string>();
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dates.ElementAt(1), newList);
            }
        }
        if (dates.Count == 1) {
            if (dates.ElementAt(0).contains("from")) {
                var a = dates.ElementAt(0).replace("from", "");
                var dts = a.Split(new[] { " to " }, StringSplitOptions.None);
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                matchedWords.Add(dts[0], newList);
                newList = new List <string>();
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dts[1], newList);
            } else {
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dates.ElementAt(0), newList);
            }
        }
        System.Diagnostics.Debug.WriteLine("\nProcessing noun phrases");
        FindMatchingNodesFromXml(root, nounPhrases);
        FindMatchingNodesFromXml(root, verbPhrases);
        FindMatchingNodesFromXml(root, nouns);
        foreach (var variable in matchedWords) {
            System.Diagnostics.Debug.WriteLine(variable.Value + "\t\t" + variable.Key);
        }
        doc = null;
        GC.Collect();
        GC.WaitForPendingFinalizers();
        //MATCHING WITH WORD NET
        System.Diagnostics.Debug.WriteLine(directory);
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the name-finder and tokenizer model streams.
        if (modelIn1 != null) {
            try {
                modelIn1.close();
            } catch (IOException e) {
            }
        }
        if (modelIn2 != null) {
            try {
                modelIn2.close();
            } catch (IOException e) {
            }
        }
    }
    return(matchedWords);
}
/// <summary>
/// Builds candidate pun sentences by replacing each homonym-bearing word with a
/// randomly chosen homonym, then lets choosePossiblePunSentence pick the result.
/// Falls back to the original sentence on any error.
/// </summary>
/// <param name="currentSentence">Sentence to rewrite.</param>
/// <returns>The chosen pun sentence, or the input sentence on failure.</returns>
public string grabPossiblePunSentences(string currentSentence)
{
    try
    {
        java.io.InputStream modelIn = new java.io.FileInputStream(@"C:\en-token.bin");
        TokenizerModel model = new TokenizerModel(modelIn);
        // Close the model stream once loaded (the original leaked it on every call).
        modelIn.close();
        Tokenizer tokenizer = new TokenizerME(model);
        string[] words = tokenizer.tokenize(currentSentence);
        List<string> possibleSentences = new List<string>();
        Homonyms homonyms = new Homonyms();
        // One shared Random: the original constructed a new Random per token in a
        // tight loop, which seeds them identically and picks the same homonym.
        Random random = new Random();
        for (int i = 0; i < words.Length; i++)
        {
            System.Console.WriteLine();
            Homonym homonym = homonyms.findWordInList(words[i]);
            if (homonym.homonyms != null)
            {
                string possibleSentence = "";
                for (int r = 0; r < words.Length; r++)
                {
                    if (words[i].Equals(words[r]))
                    {
                        // Substitute a random homonym for every occurrence of this word.
                        int randomNumber = random.Next(homonym.homonyms.Length);
                        possibleSentence += " " + homonym.homonyms[randomNumber];
                    }
                    else
                    {
                        possibleSentence += " " + words[r];
                    }
                }
                possibleSentences.Add(possibleSentence);
            }
        }
        currentSentence = choosePossiblePunSentence(currentSentence, possibleSentences);
    }
    catch (Exception e)
    {
        // Best-effort: keep the original sentence, but surface the error instead
        // of swallowing it silently as the original did.
        System.Console.WriteLine(e.Message);
    }
    return currentSentence;
}
/// <summary>Creates an empty tokenizer op; a tokenizer must be supplied before use.</summary>
public NLPTokenizerOp() { tokenizer = null; }
/// <summary>Creates a tokenizer op backed by a maxent tokenizer built from <paramref name="model"/>.</summary>
public NLPTokenizerOp(TokenizerModel model) { tokenizer = new TokenizerME(model); }
/// <summary>Wraps a maxent tokenizer built from the given pre-loaded model.</summary>
public Tokenizer(TokenizerModel model) { this.tokenizer = new TokenizerME(model); }
/// <summary>
/// Builds the tokenizer by reading a <see cref="TokenizerModel"/> from the given stream.
/// </summary>
/// <param name="modelStream">Open stream positioned at a serialized token model.</param>
public Tokenizer(FileStream modelStream)
{
    this.tokenizer = new TokenizerME(new TokenizerModel(modelStream));
}
/// <summary>
/// For each token in the sentence, prints its homonyms (if any) and the first
/// definition fetched from the Wordnik HTTP API.
/// </summary>
/// <param name="currentSentence">Sentence whose words are looked up.</param>
public void giveDefinitionAndHomonym(string currentSentence)
{
    try
    {
        java.io.InputStream modelIn = new java.io.FileInputStream(@"C:\en-token.bin");
        TokenizerModel model = new TokenizerModel(modelIn);
        // Close the model stream once loaded (the original leaked it on every call).
        modelIn.close();
        Tokenizer tokenizer = new TokenizerME(model);
        string[] words = tokenizer.tokenize(currentSentence);
        Homonyms homonyms = new Homonyms();
        for (int i = 0; i < words.Length; i++)
        {
            System.Console.WriteLine();
            Homonym homonym = homonyms.findWordInList(words[i]);
            if (homonym.homonyms == null)
            {
                System.Console.WriteLine("No homonyms found for: " + words[i]);
            }
            else
            {
                List<string> selectedHomonyms = homonym.selectedHomonyms();
                System.Console.WriteLine("Homonyms are: " + words[i]);
                foreach (string selectedWord in selectedHomonyms)
                {
                    System.Console.Write(selectedWord + ",");
                }
            }
            System.Console.WriteLine();
            System.Console.WriteLine("Definition for: " + words[i]);
            using (WebClient client = new WebClient())
            {
                // SECURITY: the API key is hard-coded in source; move it to configuration.
                string line = client.DownloadString("http://api.wordnik.com/v4/word.json/" + words[i] + "/definitions?limit=200&includeRelated=true&useCanonical=false&includeTags=false&api_key=a2a73e7b926c924fad7001ca3111acd55af2ffabf50eb4ae5");
                if (!line.Equals("[]"))
                {
                    // Crude extraction of the first "text" field from the JSON payload.
                    string[] lines1 = System.Text.RegularExpressions.Regex.Split(line, "\"text\":\"");
                    string[] lines2 = System.Text.RegularExpressions.Regex.Split(lines1[1], "\",\"sequence\"[\\W\\w]+");
                    System.Console.WriteLine(lines2[0]);
                }
                else
                {
                    System.Console.WriteLine("Definition cannot be found, word is mispelled or doesn't exist within our current data");
                }
            }
        }
    }
    catch (Exception e)
    {
        System.Console.WriteLine(e.Message);
    }
}
/// <summary>
/// POS-tags, tokenizes and chunks each sentence, yielding one list of chunk
/// items per sentence. Models are loaded once from embedded resources; the
/// returned sequence is evaluated lazily.
/// </summary>
/// <param name="Sentences">Sentences to process.</param>
/// <returns>Per-sentence chunk items (token, tag, chunk label).</returns>
public static IEnumerable<IEnumerable<ChunkItem>> GetChunks(IEnumerable<string> Sentences)
{
    var tagger = new POSTaggerME(new POSModel(new java.io.ByteArrayInputStream(Resource.en_pos_maxent)));
    var tokenizer = new TokenizerME(new TokenizerModel(new java.io.ByteArrayInputStream(Resource.en_token)));
    var chunkerMe = new ChunkerME(new ChunkerModel(new java.io.ByteArrayInputStream(Resource.en_chunker)));

    return Sentences.Select(sentence =>
    {
        var tokens = tokenizer.tokenize(sentence);
        var tags = tagger.tag(tokens);
        var chunks = chunkerMe.chunk(tokens, tags);

        var items = new List<ChunkItem>();
        for (var i = 0; i < chunks.Length; i++)
        {
            items.Add(new ChunkItem { token = tokens[i], tag = tags[i], chunk = chunks[i] });
        }
        return items;
    });
}
/// <summary>Trains a tokenizer model from the standard sample stream using the default training parameters.</summary>
/// <param name="factory">Factory describing language and tokenizer configuration.</param>
/// <returns>The trained tokenizer model.</returns>
private static TokenizerModel Train(TokenizerFactory factory) { return(TokenizerME.Train(CreateSampleStream(), factory, TrainingParameters.DefaultParameters())); }
/// <summary>Builds the tokenizer by training a model from the bundled training data on disk.</summary>
public Tokenizer() { this.tokenizer = new TokenizerME(TrainModel(Environment.CurrentDirectory + TRAINING_MODEL_PATH)); }