// Loads the tokenizer model and builds a TokenizerME.
// On I/O failure the tokenizer field is left null; the model stream is
// always closed, and close failures are deliberately ignored.
private void InitializeTokenizer()
{
    InputStream modelIn = null;
    try
    {
        modelIn = new FileInputStream(TokenizerModel);
        var loadedModel = new TokenizerModel(modelIn);
        tokenizer = new TokenizerME(loadedModel);
    }
    catch (IOException)
    {
        tokenizer = null;
    }
    finally
    {
        if (modelIn != null)
        {
            try
            {
                modelIn.close();
            }
            catch (IOException)
            {
                // best-effort close
            }
        }
    }
}
// Loads the token model from TokenModelPath and returns a maxent tokenizer.
// Fix: the input stream is closed once the model has been read —
// TokenizerModel's constructor consumes the whole stream, and the original
// leaked one FileInputStream per call.
private static TokenizerME PrepareTokenizer()
{
    var tokenInputStream = new FileInputStream(TokenModelPath); //load the token model into a stream
    var tokenModel = new TokenizerModel(tokenInputStream);      //load the token model
    tokenInputStream.close();                                   //release the file handle
    return new TokenizerME(tokenModel);                         //create the tokenizer
}
// Tokenizes every line of the lab5 result file and writes the tokens
// space-separated to lab6result.txt, breaking the line after each ".".
// Fixes: the tokenizer model is now loaded ONCE (the original re-read
// en-token.bin for every input line and never closed the stream), and the
// writer is disposed via using.
static void Main(string[] args)
{
    String[] files = Directory.GetFiles(@"..\..\..\..\Dataset", "lab5result.txt");

    InputStream modelIn = new FileInputStream(@"..\..\..\..\en-token.bin");
    TokenizerModel model = new TokenizerModel(modelIn);
    modelIn.close();
    TokenizerME enTokenizer = new TokenizerME(model);

    using (StreamWriter sw = new StreamWriter(@"..\..\..\..\lab6result.txt", false))
    {
        foreach (string file in files)
        {
            using (StreamReader sr = new StreamReader(file))
            {
                while (sr.Peek() != -1)
                {
                    string line = sr.ReadLine();
                    string[] tokens = enTokenizer.tokenize(line);
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        sw.Write(tokens[i] + " ");
                        if (tokens[i].Equals("."))
                        {
                            sw.Write("\r\n"); // sentence boundary
                        }
                    }
                }
            }
        }
    }
}
// Tokenizes every line of each .html file in the dataset and writes tokens
// space-separated to ReadByTokens.txt, breaking after each ".".
// Fixes: the tokenizer model is loaded ONCE (the original re-read
// en-token.bin for every input line and leaked each stream), and the writer
// is disposed via using.
static void Main(string[] args)
{
    String[] file = Directory.GetFiles(@"C:\10311171\lab-6-opennlp-ju-zi-qie-fen-HUNGLIWEN\lab6\Dataset", "*.html");

    InputStream modelIn = new FileInputStream(@"C:\10311171\lab-6-opennlp-ju-zi-qie-fen-HUNGLIWEN\en-token.bin");
    TokenizerModel model = new TokenizerModel(modelIn);
    modelIn.close();
    TokenizerME enTokenizer = new TokenizerME(model);

    using (StreamWriter sw = new StreamWriter(@"ReadByTokens.txt"))
    {
        foreach (string filename in file)
        {
            using (StreamReader sr = new StreamReader(filename))
            {
                while (sr.Peek() != -1)
                {
                    string line = sr.ReadLine();
                    string[] tokens = enTokenizer.tokenize(line);
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        sw.Write(tokens[i] + " ");
                        if (tokens[i].Equals("."))
                        {
                            sw.Write("\n"); // sentence boundary
                        }
                    }
                }
            }
        }
    }
}
// Tokenizes every line of each .html file in the dataset and writes tokens
// space-separated to TokensReader.txt, breaking after each ".".
// Fixes: the tokenizer model is loaded ONCE (the original re-read
// en-token.bin for every input line and leaked each stream), the writer is
// disposed via using, and dead commented-out regex code was removed.
static void Main(string[] args)
{
    String[] files = Directory.GetFiles(@"C:\10311173\lab-5-opennlp-tokenization-Changyiyu\Dataset", "*.html");

    InputStream modelIn = new FileInputStream(@"C:\10311173\lab-6-opennlp-ju-zi-qie-fen-Changyiyu\en-token.bin");
    TokenizerModel model = new TokenizerModel(modelIn);
    modelIn.close();
    TokenizerME enTokenizer = new TokenizerME(model);

    using (StreamWriter sw = new StreamWriter(@"TokensReader.txt"))
    {
        foreach (string thefile in files)
        {
            using (StreamReader sr = new StreamReader(thefile))
            {
                while (sr.Peek() != -1)
                {
                    string line = sr.ReadLine();
                    string[] tokens = enTokenizer.tokenize(line);
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        sw.Write(tokens[i] + " ");
                        if (tokens[i].Equals("."))
                        {
                            sw.Write("\r\n"); // sentence boundary
                        }
                    }
                }
            }
        }
    }
}
// Tokenizes a review string; periods are stripped first, so "." never
// appears as a token.
// Fix: the model stream is now closed after the model is read (the original
// leaked one file handle per call).
string[] Tokenizer(string review)
{
    InputStream modelIn = new FileInputStream(modelPath + "en-token.zip");
    TokenizerModel model = new TokenizerModel(modelIn);
    modelIn.close();
    TokenizerME tokenizer = new TokenizerME(model);
    return tokenizer.tokenize(review.Replace(".", ""));
}
// Returns the character spans of the tokens found in the paragraph,
// using the en-token model. The model stream is closed before returning.
public Span[] GetTokens(string paragraph)
{
    var modelStream = GetFileStream("en-token.bin");
    var spans = new TokenizerME(new TokenizerModel(modelStream)).tokenizePos(paragraph);
    modelStream.close();
    return spans;
}
// Wraps the named tokenizer model in an NLPTokenizerOp.
// A null model name yields the default op (no model); an unknown name
// throws from the dictionary lookup, as before.
public static NLPTokenizerOp GetTokenizer(string modelName)
{
    if (modelName == null)
    {
        return new NLPTokenizerOp();
    }
    return new NLPTokenizerOp(tokenizerModels[modelName]);
}
// Returns the tokenizer model for the given name, loading it through the
// resource loader on first use and memoizing it in tokenizerModels.
public static TokenizerModel GetTokenizerModel(string modelName, IResourceLoader loader)
{
    // Serve from cache when a non-null entry exists.
    if (tokenizerModels.TryGetValue(modelName, out TokenizerModel cached) && cached != null)
    {
        return cached;
    }

    using (Stream resource = loader.OpenResource(modelName))
    {
        cached = new TokenizerModel(new ikvm.io.InputStreamWrapper(resource));
    }
    tokenizerModels[modelName] = cached;
    return cached;
}
// Static initializer: resolves the en-token model under the web app's
// Files folder, fails fast if it is missing, then opens it and builds the
// shared TokenizerModel.
static Tokenizer()
{
    var path = HttpContext.Current.Server.MapPath("~/Files/TextAnalytics/en-token.bin");
    if (!File.Exists(path))
    {
        throw new FileNotFoundException("Unable to find tokenizer model file at " + path);
    }

    ModelIn = new java.io.FileInputStream(path);
    Model = new TokenizerModel(ModelIn);
}
// Demo: tokenizes the pre-sentenced USPE corpus with the en-token model.
// Fix: the model stream is closed once the model has been deserialized
// (the original leaked it).
public void Tokenize()
{
    var modelStream = new java.io.FileInputStream("../../Models/en-token.bin");
    var model = new TokenizerModel(modelStream);
    modelStream.close();
    var tokenizer = new TokenizerME(model);
    var txt = File.ReadAllText(@"c:\dev\d-mill\uspe\Data\uspe-sentenced.txt");
    var tokens = tokenizer.tokenize(txt);
    // NOTE(review): tokens is computed but never used or returned —
    // presumably a smoke test; confirm before removing.
}
// Static initializer: reads the model path from App.Config
// ("ModelTokenizer"), anchors it at the application base directory, and
// opens the tokenizer model. Throws if the setting or the file is missing.
static Tokenizer()
{
    var configured = ConfigurationManager.AppSettings["ModelTokenizer"] ?? string.Empty;
    if (string.IsNullOrWhiteSpace(configured))
    {
        throw new Exception("ModelTokenizer setting not defined in App.Config");
    }

    var fullPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, configured);
    if (!File.Exists(fullPath))
    {
        throw new FileNotFoundException("Unable to find tokenizer model file at " + fullPath);
    }

    ModelIn = new java.io.FileInputStream(fullPath);
    Model = new TokenizerModel(ModelIn);
}
// CLI entry point: expects exactly one argument (the tokenizer model file);
// otherwise prints the help text.
public override void run(string[] args)
{
    if (args.Length != 1)
    {
        Console.WriteLine(Help);
        return; // guard clause instead of else-branch
    }

    TokenizerModel model = (new TokenizerModelLoader()).load(new File(args[0]));
    var cli = new CommandLineTokenizer(new opennlp.tools.tokenize.TokenizerME(model));
    cli.process();
}
// Strips simple HTML markup from each line of the dataset's .html files,
// tokenizes the result, and writes tokens space-separated to Html.txt with
// a break after each ".".
// Fixes: the tokenizer model is loaded ONCE (the original re-read
// en-token.bin per line and leaked each stream), the constant Regex objects
// are built once instead of per line, and the writer is disposed via using.
static void Main(string[] args)
{
    String[] file = Directory.GetFiles(@"..\..\..\..\Dataset\", "*.html");

    // Tag-stripping patterns (constant, so compile once).
    Regex openTags = new Regex(@"(<P>|<BR />|<DIV>)");
    Regex closeTags = new Regex(@"(</P>|<p/>|</DIV>)");
    string regFind = @"<a.*?>" + @"(?'text'.*?)" + @"</a>";
    string regReplace = @"${text}";
    string regFindimg = @"<img.*?title=""" + @"(?'text2'.*?)" + @""" />";
    string regReplaceimg = @"${text2}";

    InputStream modelIn = new FileInputStream(@"..\..\..\..\en-token.bin");
    TokenizerModel model = new TokenizerModel(modelIn);
    modelIn.close();
    TokenizerME enTokenizer = new TokenizerME(model);

    using (StreamWriter sw = new StreamWriter(@"..\..\..\Html.txt"))
    {
        foreach (String files in file)
        {
            using (StreamReader sr = new StreamReader(files))
            {
                while (sr.Peek() != -1)
                {
                    string line = sr.ReadLine();
                    line = openTags.Replace(line, "");
                    line = closeTags.Replace(line, "");
                    // NOTE(review): pattern below appears to be a (possibly
                    // non-breaking) space from the original source — verify.
                    line = Regex.Replace(line, @" ", "");
                    line = Regex.Replace(line, regFind, regReplace);       // unwrap <a>text</a>
                    String[] tokens = enTokenizer.tokenize(Regex.Replace(line, regFindimg, regReplaceimg)); // keep img titles
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        sw.Write(tokens[i] + " ");
                        if (tokens[i].Equals("."))
                        {
                            sw.Write("\n"); // sentence boundary
                        }
                    }
                }
            }
        }
    }
}
// Static initializer: the "ModelTokenizer" App.Config setting gives the
// model path relative to the application base directory. Fails fast when
// the setting is absent or the file does not exist.
static Tokenizer()
{
    var setting = ConfigurationManager.AppSettings["ModelTokenizer"] ?? string.Empty;
    if (string.IsNullOrWhiteSpace(setting))
    {
        throw new Exception("ModelTokenizer setting not defined in App.Config");
    }

    var modelPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, setting);
    if (!File.Exists(modelPath))
    {
        throw new FileNotFoundException("Unable to find tokenizer model file at " + modelPath);
    }

    ModelIn = new java.io.FileInputStream(modelPath);
    Model = new TokenizerModel(ModelIn);
}
// Evaluates a tokenizer model against the sample stream and prints its
// F-measure; optionally attaches a listener that reports misclassified
// samples. The sample stream is always closed, best-effort.
public override void run(string format, string[] args)
{
    base.run(format, args);

    TokenizerModel model = (new TokenizerModelLoader()).load(@params.Model);

    // Only report misclassified samples when requested.
    TokenizerEvaluationMonitor misclassifiedListener =
        @params.Misclassified.Value ? (TokenizerEvaluationMonitor)new TokenEvaluationErrorListener() : null;

    var evaluator = new TokenizerEvaluator(
        new opennlp.tools.tokenize.TokenizerME(model), misclassifiedListener);

    Console.Write("Evaluating ... ");
    try
    {
        evaluator.evaluate(sampleStream);
    }
    catch (IOException e)
    {
        Console.Error.WriteLine("failed");
        throw new TerminateToolException(-1, "IO error while reading test data: " + e.Message, e);
    }
    finally
    {
        try
        {
            sampleStream.close();
        }
        catch (IOException)
        {
            // sorry that this can fail
        }
    }

    Console.WriteLine("done");
    Console.WriteLine();
    Console.WriteLine(evaluator.FMeasure);
}
// Verifies that a TokenizerModel trained with a custom (dummy) factory
// retains the factory, its abbreviation dictionary, context generator,
// alphanumeric pattern and language code — both on the freshly trained
// model and after a serialize/deserialize round trip.
public void TestDummyFactory()
{
    const string lang = "es";
    const string pattern = "^[0-9A-Za-z]+$";
    var dic = LoadAbbDictionary();

    var model = Train(new DummyTokenizerFactory(lang, dic, true, pattern));

    // Freshly trained model must expose the dummy factory and its parts.
    Assert.IsInstanceOf(typeof(DummyTokenizerFactory), model.Factory);
    var factory = model.Factory;
    Assert.IsInstanceOf(typeof(DummyTokenizerFactory.DummyDictionary), factory.AbbreviationDictionary);
    Assert.IsInstanceOf(typeof(DummyTokenizerFactory.DummyContextGenerator), factory.ContextGenerator);
    Assert.AreEqual(pattern, factory.AlphaNumericPattern);
    Assert.AreEqual(lang, factory.LanguageCode);
    Assert.AreEqual(lang, model.Language);
    Assert.AreEqual(true, factory.UseAlphaNumericOptimization);

    // Round-trip through a stream; everything must survive deserialization.
    using (var data = new MemoryStream())
    {
        model.Serialize(new UnclosableStream(data));
        data.Seek(0, SeekOrigin.Begin);
        var fromSerialized = new TokenizerModel(data);
        Assert.IsInstanceOf(typeof(DummyTokenizerFactory), fromSerialized.Factory);
        factory = fromSerialized.Factory;
        Assert.IsInstanceOf(typeof(DummyTokenizerFactory.DummyDictionary), factory.AbbreviationDictionary);
        Assert.IsInstanceOf(typeof(DummyTokenizerFactory.DummyContextGenerator), factory.ContextGenerator);
        Assert.AreEqual(pattern, factory.AlphaNumericPattern);
        Assert.AreEqual(lang, factory.LanguageCode);
        Assert.AreEqual(lang, fromSerialized.Language);
        Assert.AreEqual(true, factory.UseAlphaNumericOptimization);
    }
}
// Loads all OpenNLP models (sentence detector, tokenizer, POS tagger,
// chunker, parser) and the stop-word list.
// Fixes: each model stream is closed after its model has been deserialized,
// and the stop-word reader is disposed via using (the original leaked all
// six handles).
public NLP()
{
    //loading sentence detector model
    java.io.FileInputStream modelInpStream = new java.io.FileInputStream("Resources\\en-sent.bin");
    SentenceModel sentenceModel = new SentenceModel(modelInpStream);
    sentenceDetector = new SentenceDetectorME(sentenceModel);
    modelInpStream.close();

    //loading tokenizer model
    modelInpStream = new java.io.FileInputStream("Resources\\en-token.bin");
    TokenizerModel tokenizerModel = new TokenizerModel(modelInpStream);
    tokenizer = new TokenizerME(tokenizerModel);
    modelInpStream.close();

    modelInpStream = new java.io.FileInputStream("Resources\\en-pos-maxent.bin");
    POSModel posModel = new POSModel(modelInpStream);
    tagger = new POSTaggerME(posModel);
    modelInpStream.close();

    modelInpStream = new java.io.FileInputStream("Resources\\en-chunker.bin");
    ChunkerModel chunkerModel = new ChunkerModel(modelInpStream);
    chunker = new ChunkerME(chunkerModel);
    modelInpStream.close();

    modelInpStream = new java.io.FileInputStream("Resources\\en-parser-chunking.bin");
    ParserModel parserModel = new ParserModel(modelInpStream);
    parser = ParserFactory.create(parserModel);
    modelInpStream.close();

    //loading stop words list — each word is stored both stemmed and raw
    using (StreamReader sr = new StreamReader("Resources\\english.stop.txt"))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            stopwords.Add(Stemming(line));
            stopwords.Add(line);
        }
    }
}
// Strips HTML markup from each line of the dataset's .html files via regex,
// tokenizes the result, and writes tokens space-separated to
// ReadByTokens.txt, breaking after each ".".
// Fixes: the tokenizer model is loaded ONCE (the original re-read
// en-token.bin per line and leaked each stream) and the writer is disposed
// via using. Regex replacement order is unchanged.
static void Main(string[] args)
{
    String[] file = Directory.GetFiles(@"C:\10311209\lab-6-opennlp-ju-zi-qie-fen-XiuXuanLiu\Dataset", "*.html");

    InputStream modelIn = new FileInputStream(@"C:\10311209\lab-6-opennlp-ju-zi-qie-fen-XiuXuanLiu\en-token.bin");
    TokenizerModel model = new TokenizerModel(modelIn);
    modelIn.close();
    TokenizerME enTokenizer = new TokenizerME(model);

    using (StreamWriter sw = new StreamWriter(@"ReadByTokens.txt"))
    {
        foreach (string filename in file)
        {
            using (StreamReader sr = new StreamReader(filename))
            {
                while (sr.Peek() != -1)
                {
                    string line = sr.ReadLine();
                    line = Regex.Replace(line, "<P[^>]*>", ""); // [^>] matches any char that is not '>'
                    // NOTE(review): pattern below appears to be a (possibly
                    // non-breaking) space from the original source — verify.
                    line = Regex.Replace(line, @" ", "");
                    line = Regex.Replace(line, "<DIV[^>]*>", "");
                    line = Regex.Replace(line, "<BR[^>]*>", "");
                    line = Regex.Replace(line, "<img[^>]*title=\"(?'titleName'.*?)\"[^>]*>", "${titleName}"); // keep img titles
                    line = Regex.Replace(line, "<[^>]*href.*>(?'Name'.*?)<[^>]*>", "${Name}");               // unwrap anchors
                    line = Regex.Replace(line, "<[^>]*>", "");                                               // drop remaining tags
                    string[] tokens = enTokenizer.tokenize(line);
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        sw.Write(tokens[i] + " ");
                        if (tokens[i].Equals("."))
                        {
                            sw.Write("\n"); // sentence boundary
                        }
                    }
                }
            }
        }
    }
}
// For every file in the input folder: detects sentences on each line,
// concatenates them, tokenizes the result, and writes the tokens
// space-separated (one input line per output line) to a same-named file in
// the answer folder.
// Fixes: model streams are closed after loading; Path.GetFileName replaces
// the fragile hard-coded Split('\\')[6]; reader and writer are disposed
// via using.
static void Main(string[] args)
{
    java.io.InputStream modelIn = new java.io.FileInputStream("en-sent.bin");
    java.io.InputStream modelIn2 = new java.io.FileInputStream("en-token.bin");
    TokenizerModel model = new TokenizerModel(modelIn2);
    TokenizerME mE = new TokenizerME(model);
    SentenceModel sM = new SentenceModel(modelIn);
    SentenceDetector detector = new SentenceDetectorME(sM);
    modelIn.close();
    modelIn2.close();

    string folderName = @"C:\Users\Administrator\Desktop\lab-6-opennlp-ju-zi-qie-fen-10411174\file";
    foreach (string fname in System.IO.Directory.GetFiles(folderName))
    {
        string outName = Path.GetFileName(fname);
        using (StreamWriter sw = new StreamWriter(@"C:\Users\Administrator\Desktop\lab-6-opennlp-ju-zi-qie-fen-10411174\answer\" + outName))
        using (StreamReader file2 = new StreamReader(fname))
        {
            String line;
            while ((line = file2.ReadLine()) != null)
            {
                string[] sents = detector.sentDetect(line);
                if (sents.Length == 0)
                {
                    continue; // nothing detected on this line
                }
                // Join the detected sentences back together before tokenizing.
                string str = string.Concat(sents);
                var Tokens = mE.tokenize(str);
                foreach (var s in Tokens)
                {
                    sw.Write(s + " ");
                }
                sw.WriteLine();
            }
        }
    }
}
// Verifies that a TokenizerModel trained with the default TokenizerFactory
// keeps the dictionary, default context generator, default alphanumeric
// pattern and language code — both on the trained model and after a
// serialize/deserialize round trip.
public void TestDefault()
{
    var dict = LoadAbbDictionary();
    const string lang = "es";

    // Null pattern => the factory's default alphanumeric pattern is used.
    var model = Train(new TokenizerFactory(lang, dict, false, null));

    var factory = model.Factory;
    Assert.IsInstanceOf(typeof(Dict), factory.AbbreviationDictionary);
    Assert.IsInstanceOf(typeof(DefaultTokenContextGenerator), factory.ContextGenerator);
    Assert.AreEqual(Factory.DefaultAlphanumeric, factory.AlphaNumericPattern);
    Assert.AreEqual(lang, factory.LanguageCode);
    Assert.AreEqual(lang, model.Language);
    Assert.AreEqual(false, factory.UseAlphaNumericOptimization);

    // Round-trip through a stream; everything must survive deserialization.
    using (var data = new MemoryStream())
    {
        model.Serialize(new UnclosableStream(data));
        data.Seek(0, SeekOrigin.Begin);
        var fromSerialized = new TokenizerModel(data);
        factory = fromSerialized.Factory;
        Assert.IsInstanceOf(typeof(Dict), factory.AbbreviationDictionary);
        Assert.IsInstanceOf(typeof(DefaultTokenContextGenerator), factory.ContextGenerator);
        Assert.AreEqual(Factory.DefaultAlphanumeric, factory.AlphaNumericPattern);
        Assert.AreEqual(lang, factory.LanguageCode);
        Assert.AreEqual(lang, fromSerialized.Language);
        Assert.AreEqual(false, factory.UseAlphaNumericOptimization);
    }
}
// Builds the MUC coreference training sample stream: raw MUC documents are
// read as UTF-8 strings, tokenized, full-parsed, enhanced with
// person/organization name-finder spans, and finally converted to
// CorefSamples with mention insertion.
public override ObjectStream <CorefSample> create(string[] args)
{
    Parameters @params = ArgumentParser.parse(args, typeof(Parameters));

    // Parser and tokenizer are loaded from the model files given as params.
    ParserModel parserModel = (new ParserModelLoader()).load(@params.ParserModel);
    Parser parser = ParserFactory.create(parserModel);
    TokenizerModel tokenizerModel = (new TokenizerModelLoader()).load(@params.TokenizerModel);
    Tokenizer tokenizer = new TokenizerME(tokenizerModel);

    // Stream of raw MUC documents, each file read as one UTF-8 string.
    ObjectStream <string> mucDocStream = new FileToStringSampleStream(new DirectorySampleStream(@params.Data, new FileFilterAnonymousInnerClassHelper(this), false), Charset.forName("UTF-8"));
    ObjectStream <RawCorefSample> rawSamples = new MucCorefSampleStream(tokenizer, mucDocStream);
    ObjectStream <RawCorefSample> parsedSamples = new FullParseCorefEnhancerStream(parser, rawSamples);

    // How to load all these nameFinder models ?!
    // Lets make a param per model, not that nice, but ok!
    IDictionary <string, Jfile> modelFileTagMap = new Dictionary <string, Jfile>();
    modelFileTagMap["person"] = @params.PersonModel;
    modelFileTagMap["organization"] = @params.OrganizationModel;

    // One NameFinderME per entity type, with its tag kept in parallel.
    IList <TokenNameFinder> nameFinders = new List <TokenNameFinder>();
    IList <string> tags = new List <string>();
    foreach (KeyValuePair <string, Jfile> entry in modelFileTagMap)
    {
        nameFinders.Add(new NameFinderME((new TokenNameFinderModelLoader()).load(entry.Value)));
        tags.Add(entry.Key);
    }

    return(new MucMentionInserterStream(new NameFinderCorefEnhancerStream(nameFinders.ToArray(), tags.ToArray(), parsedSamples)));
}
// Builds a tokenizer from an already-opened model stream; the caller
// remains responsible for closing the stream.
public Tokenizer(FileStream modelStream)
{
    var model = new TokenizerModel(modelStream);
    tokenizer = new TokenizerME(model);
}
// Loads the token model from TokenModelPath and returns a maxent tokenizer.
// Fix: close the stream once the model is constructed — TokenizerModel's
// constructor reads the whole stream, and the original leaked one
// FileInputStream per call.
private static TokenizerME PrepareTokenizer()
{
    var tokenInputStream = new FileInputStream(TokenModelPath); //load the token model into a stream
    var tokenModel = new TokenizerModel(tokenInputStream);      //load the token model
    tokenInputStream.close();                                   //release the file handle
    return new TokenizerME(tokenModel);                         //create the tokenizer
}
// Builds the tokenizer for the current language. Spanish has no trained
// model yet, so "es" falls back to the Portuguese model file.
private TokenizerME PrepareTokenizer()
{
    //TODO[danielcamargo]: we need to find/train the model in spanish
    var modelFile = string.Format(@"Models\{0}-token.bin", _language == "es" ? "pt" : _language);
    var stream = new FileInputStream(modelFile);
    var model = new TokenizerModel(stream);
    stream.close();
    return new TokenizerME(model);
}
// For each token of the sentence: prints its known homonyms (if any) and
// fetches/prints a definition from the Wordnik REST API.
// NOTE(review): the model stream is never closed, the API key is
// hard-coded, and the JSON response is scraped with regex splits instead of
// a JSON parser — all candidates for cleanup.
public void giveDefinitionAndHomonym(string currentSentence)
{
    try
    {
        java.io.InputStream modelIn = new java.io.FileInputStream(@"C:\en-token.bin");
        TokenizerModel model = new TokenizerModel(modelIn);
        Tokenizer tokenizer = new TokenizerME(model);
        string[] words = tokenizer.tokenize(currentSentence);
        Homonyms homonyms = new Homonyms();
        for (int i = 0; i < words.Length; i++)
        {
            System.Console.WriteLine();
            Homonym homonym = homonyms.findWordInList(words[i]);
            if (homonym.homonyms == null)
            {
                System.Console.WriteLine("No homonyms found for: " + words[i]);
            }
            else
            {
                List<string> selectedHomonyms = homonym.selectedHomonyms();
                System.Console.WriteLine("Homonyms are: " + words[i]);
                foreach (string selectedWord in selectedHomonyms)
                {
                    System.Console.Write(selectedWord + ",");
                }
            }
            System.Console.WriteLine();
            System.Console.WriteLine("Definition for: " + words[i]);
            using (WebClient client = new WebClient())
            {
                // Wordnik definitions endpoint; the first "text" field of the
                // JSON array is extracted below via two regex splits.
                string line = client.DownloadString("http://api.wordnik.com/v4/word.json/" + words[i] + "/definitions?limit=200&includeRelated=true&useCanonical=false&includeTags=false&api_key=a2a73e7b926c924fad7001ca3111acd55af2ffabf50eb4ae5");
                if (!line.Equals("[]"))
                {
                    string[] lines1 = System.Text.RegularExpressions.Regex.Split(line, "\"text\":\"");
                    string[] lines2 = System.Text.RegularExpressions.Regex.Split(lines1[1], "\",\"sequence\"[\\W\\w]+");
                    System.Console.WriteLine(lines2[0]);
                }
                else
                {
                    System.Console.WriteLine("Definition cannot be found, word is mispelled or doesn't exist within our current data");
                }
            }
        }
    }
    catch (Exception e)
    {
        // Any failure (I/O, network, parsing) is reported and swallowed.
        System.Console.WriteLine(e.Message);
    }
}
// Builds candidate "pun" sentences by replacing each word that has known
// homonyms with a randomly chosen homonym (all occurrences of that word),
// then lets choosePossiblePunSentence pick the best candidate.
// Fixes: a single Random instance is reused — the original constructed one
// per replacement, which produces identical seeds (and identical picks)
// when created in quick succession; the model stream is now closed; the
// silent empty catch now records the failure.
public string grabPossiblePunSentences(string currentSentence)
{
    try
    {
        java.io.InputStream modelIn = new java.io.FileInputStream(@"C:\en-token.bin");
        TokenizerModel model = new TokenizerModel(modelIn);
        Tokenizer tokenizer = new TokenizerME(model);
        modelIn.close();
        string[] words = tokenizer.tokenize(currentSentence);
        List<string> possibleSentences = new List<string>();
        Homonyms homonyms = new Homonyms();
        Random random = new Random();
        for (int i = 0; i < words.Length; i++)
        {
            System.Console.WriteLine();
            Homonym homonym = homonyms.findWordInList(words[i]);
            if (homonym.homonyms == null)
            {
                continue; // no homonyms known for this word
            }
            string possibleSentence = "";
            for (int r = 0; r < words.Length; r++)
            {
                if (words[i].Equals(words[r]))
                {
                    // Replace every occurrence of the current word with a
                    // randomly selected homonym.
                    int randomNumber = random.Next(homonym.homonyms.Length);
                    possibleSentence += " " + homonym.homonyms[randomNumber];
                }
                else
                {
                    possibleSentence += " " + words[r];
                }
            }
            possibleSentences.Add(possibleSentence);
        }
        currentSentence = choosePossiblePunSentence(currentSentence, possibleSentences);
    }
    catch (Exception e)
    {
        // Best-effort: on any failure return the input sentence unchanged,
        // but at least surface the error instead of swallowing it silently.
        System.Console.WriteLine(e.Message);
    }
    return currentSentence;
}
// Runs the POS-tag + chunk pipeline over each sentence and yields one
// ChunkItem (token, tag, chunk) sequence per sentence.
// NOTE: the returned Select is lazy — each enumeration re-runs the NLP
// pipeline over the sentences.
public static IEnumerable<IEnumerable<ChunkItem>> GetChunks(IEnumerable<string> Sentences)
{
    // Build the three engines once from the embedded model resources.
    var tagger = new POSTaggerME(new POSModel(new java.io.ByteArrayInputStream(Resource.en_pos_maxent)));
    var tokenizer = new TokenizerME(new TokenizerModel(new java.io.ByteArrayInputStream(Resource.en_token)));
    var chunker = new ChunkerME(new ChunkerModel(new java.io.ByteArrayInputStream(Resource.en_chunker)));

    return Sentences.Select(sentence =>
    {
        var tokens = tokenizer.tokenize(sentence);
        var tags = tagger.tag(tokens);
        var chunks = chunker.chunk(tokens, tags);

        var items = new List<ChunkItem>();
        for (var index = 0; index < chunks.Length; index++)
        {
            items.Add(new ChunkItem { token = tokens[index], tag = tags[index], chunk = chunks[index] });
        }
        return items;
    });
}
// Parses a natural-language query (e.g. "Show me the sales of X in Nicosia
// from January 2017 to October 2017 as a line chart") into a dictionary
// mapping matched phrases to semantic roles (START_PERIOD, END_PERIOD, and
// roles resolved from file2.xml).
// NOTE(review): this method loads three OpenNLP models on every call and
// runs the parse twice (a second pass after stripping already-matched words
// and stop words); modelIn (the parser stream) is never closed in finally.
public Dictionary <string, List <string> > Main(string line)
{
    //debug sentence
    // line = "Show me the sales of Kean Cola .25ltr Bottle in Nicosia from January 2017 to October 2017 as a line chart.";
    // Reset the per-call accumulators (fields on this class).
    matchedWords?.Clear();
    nounPhrases?.Clear();
    nouns?.Clear();
    adjectivePhrases?.Clear();
    verbPhrases?.Clear();
    InputStream modelIn = new FileInputStream(HttpRuntime.AppDomainAppPath + "\\Models\\en-parser-chunking.bin");
    InputStream modelIn1 = new FileInputStream(HttpRuntime.AppDomainAppPath + "\\Models\\en-ner-date.bin");
    InputStream modelIn2 = new FileInputStream(HttpRuntime.AppDomainAppPath + "\\Models\\en-token.bin");
    ParserModel model = new ParserModel(modelIn);
    var myParser = ParserFactory.create(model);
    var topParses = ParserTool.parseLine(line, myParser, 1);
    // First parse: collect phrase sets into the class fields.
    foreach (var p in topParses)
    {
        GetSentenceParts(p);
    }
    try
    {
        TokenizerModel model1 = new TokenizerModel(modelIn2);
        TokenNameFinderModel model2 = new TokenNameFinderModel(modelIn1);
        Tokenizer tokenizer = new TokenizerME(model1);
        var nameFinder = new NameFinderME(model2);
        // Run date NER over the tokenized sentence.
        var tokens = tokenizer.tokenize(line);
        var nameSpans = nameFinder.find(tokens);
        var array = Span.spansToStrings(nameSpans, tokens);
        //
        // foreach (var v in array)
        // {
        //     System.Diagnostics.Debug.WriteLine(v);
        // }
        dates = new HashSet <string>(array);
        PrintSets();
        // System.Diagnostics.Debug.WriteLine("\nProcessing Presentation type");
        //
        // if (nouns.Contains("table"))
        // {
        //     matchedWords.Add(new Tuple<string, string>("PRESENTATION_TYPE", "table"));
        // }
        // if (nounPhrases.Contains("bar chart"))
        // {
        //     matchedWords.Add(new Tuple<string, string>("PRESENTATION_TYPE", "bar chart"));
        // }
        // if (nounPhrases.Contains("line chart"))
        // {
        //     matchedWords.Add(new Tuple<string, string>("PRESENTATION_TYPE", "line chart"));
        // }
        //TODO IF NO OPTION IS FOUND ASK THE USER TO GIVE YOU ONE. IMPLEMENT IT IN THE WEB VERSION SOON
        System.Diagnostics.Debug.WriteLine("\nProcessing Dates");
        // Two date spans => explicit start and end periods.
        if (dates.Count == 2)
        {
            if (dates.ElementAt(0).contains("from"))
            {
                var a = dates.ElementAt(0).replace("from", "");
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                matchedWords.Add(a, newList);
                newList = new List <string>();
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dates.ElementAt(1), newList);
            }
            else
            {
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                matchedWords.Add(dates.ElementAt(0), newList);
                newList = new List <string>();
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dates.ElementAt(1), newList);
            }
        }
        // One date span: either "from X to Y" fused into one span, or a
        // single period playing both roles.
        if (dates.Count == 1)
        {
            if (dates.ElementAt(0).contains("from"))
            {
                var a = dates.ElementAt(0).replace("from", "");
                var dts = a.Split(new[] { " to " }, StringSplitOptions.None);
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                matchedWords.Add(dts[0], newList);
                newList = new List <string>();
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dts[1], newList);
            }
            else
            {
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dates.ElementAt(0), newList);
            }
        }
        System.Diagnostics.Debug.WriteLine("\nProcessing noun phrases");
        // var manager = new Manager();
        // var serializer = new XmlSerializer(typeof(Manager.language));
        // var loadStream = new FileStream("file2.xml", FileMode.Open, FileAccess.Read);
        // var loadedObject = (Manager.language) serializer.Deserialize(loadStream);
        var doc = new XmlDocument();
        // System.Diagnostics.Debug.WriteLine(HttpRuntime.AppDomainAppPath);
        // System.Diagnostics.Debug.WriteLine(HttpRuntime.AppDomainAppPath);
        // System.Diagnostics.Debug.WriteLine(HttpRuntime.AppDomainAppPath);
        // System.Diagnostics.Debug.WriteLine(HttpRuntime.AppDomainAppPath);
        doc.Load(HttpRuntime.AppDomainAppPath + "\\file2.xml");
        var root = doc.SelectSingleNode("*");
        FindMatchingNodesFromXml(root, nounPhrases);
        // Drop plain nouns already covered by a matched phrase.
        foreach (var item in nouns.ToList())
        {
            foreach (var VARIABLE in matchedWords)
            {
                if (VARIABLE.Key.Contains(item))
                {
                    nouns.Remove(item); //Will work!
                }
            }
        }
        FindMatchingNodesFromXml(root, verbPhrases);
        // FindMatchingNodesFromXml(root, nouns);
        System.Diagnostics.Debug.WriteLine("\nProcessing verb phrases ");
        System.Diagnostics.Debug.WriteLine("\nProcessing nouns ");
        // construct the dictionary object and open it
        var directory = Directory.GetCurrentDirectory() + "\\wordnet\\";
        foreach (var variable in matchedWords)
        {
            System.Diagnostics.Debug.WriteLine(variable.Value + "\t\t" + variable.Key);
        }
        // Strip matched phrases and stop words from the sentence before the
        // second parse.
        foreach (var variable in matchedWords)
        {
            string a = variable.Key;
            if (line.Contains(a))
            {
                line = line.replace(a, "");
            }
        }
        foreach (var variable in stopWordsofwordnet)
        {
            string a = " " + variable.toLowerCase() + " ";
            if (line.Contains(a))
            {
                line = line.replace(a, " ");
            }
        }
        if (line.contains("."))
        {
            line = line.replace(".", "");
        }
        if (line.contains("-"))
        {
            line = line.replace("-", " ");
        }
        System.Diagnostics.Debug.WriteLine("/////////////");
        System.Diagnostics.Debug.WriteLine("SECOND PARSE STRING " + line);
        System.Diagnostics.Debug.WriteLine("/////////////");
        line = line.Trim();
        // Second parse over the reduced sentence.
        topParses = ParserTool.parseLine(line, myParser, 1);
        nounPhrases?.Clear();
        dates?.Clear();
        verbPhrases?.Clear();
        nouns?.Clear();
        foreach (var p in topParses)
        {
            //p.show();
            GetSentenceParts(p);
        }
        FindMatchingNodesFromXml(root, nounPhrases);
        foreach (var item in nouns.ToList())
        {
            foreach (var VARIABLE in matchedWords)
            {
                if (VARIABLE.Key.Contains(item))
                {
                    nouns.Remove(item); //Will work!
                }
            }
        }
        FindMatchingNodesFromXml(root, verbPhrases);
        FindMatchingNodesFromXml(root, nouns);
        // Re-run date NER on the reduced sentence.
        tokens = tokenizer.tokenize(line);
        nameSpans = nameFinder.find(tokens);
        array = Span.spansToStrings(nameSpans, tokens);
        dates = new HashSet <string>(array);
        PrintSets();
        System.Diagnostics.Debug.WriteLine("\nProcessing Dates");
        // Same date-role assignment as in the first pass.
        if (dates.Count == 2)
        {
            if (dates.ElementAt(0).contains("from"))
            {
                var a = dates.ElementAt(0).replace("from", "");
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                matchedWords.Add(a, newList);
                newList = new List <string>();
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dates.ElementAt(1), newList);
            }
            else
            {
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                matchedWords.Add(dates.ElementAt(0), newList);
                newList = new List <string>();
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dates.ElementAt(1), newList);
            }
        }
        if (dates.Count == 1)
        {
            if (dates.ElementAt(0).contains("from"))
            {
                var a = dates.ElementAt(0).replace("from", "");
                var dts = a.Split(new[] { " to " }, StringSplitOptions.None);
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                matchedWords.Add(dts[0], newList);
                newList = new List <string>();
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dts[1], newList);
            }
            else
            {
                List <string> newList = new List <string>();
                newList.Add("START_PERIOD");
                newList.Add("END_PERIOD");
                //todo fix when the date is the same here
                matchedWords.Add(dates.ElementAt(0), newList);
            }
        }
        System.Diagnostics.Debug.WriteLine("\nProcessing noun phrases");
        // var manager = new Manager();
        // var serializer = new XmlSerializer(typeof(Manager.language));
        // var loadStream = new FileStream("file2.xml", FileMode.Open, FileAccess.Read);
        // var loadedObject = (Manager.language) serializer.Deserialize(loadStream);
        FindMatchingNodesFromXml(root, nounPhrases);
        FindMatchingNodesFromXml(root, verbPhrases);
        FindMatchingNodesFromXml(root, nouns);
        foreach (var variable in matchedWords)
        {
            System.Diagnostics.Debug.WriteLine(variable.Value + "\t\t" + variable.Key);
        }
        doc = null;
        GC.Collect();
        GC.WaitForPendingFinalizers();
        //MATCHING WITH WORD NET
        System.Diagnostics.Debug.WriteLine(directory);
        // var wordNet = new WordNetEngine();
        //
        // wordNet.AddDataSource(new StreamReader(Path.Combine(directory, "data.adj")), PartOfSpeech.Adjective);
        // wordNet.AddDataSource(new StreamReader(Path.Combine(directory, "data.adv")), PartOfSpeech.Adverb);
        // wordNet.AddDataSource(new StreamReader(Path.Combine(directory, "data.noun")), PartOfSpeech.Noun);
        // wordNet.AddDataSource(new StreamReader(Path.Combine(directory, "data.verb")), PartOfSpeech.Verb);
        //
        // wordNet.AddIndexSource(new StreamReader(Path.Combine(directory, "index.adj")), PartOfSpeech.Adjective);
        // wordNet.AddIndexSource(new StreamReader(Path.Combine(directory, "index.adv")), PartOfSpeech.Adverb);
        // wordNet.AddIndexSource(new StreamReader(Path.Combine(directory, "index.noun")), PartOfSpeech.Noun);
        // wordNet.AddIndexSource(new StreamReader(Path.Combine(directory, "index.verb")), PartOfSpeech.Verb);
        //
        // System.Diagnostics.Debug.WriteLine("Loading database...");
        // wordNet.Load();
        // System.Diagnostics.Debug.WriteLine("Load completed.");
        // while (true)
        // {
        //     System.Diagnostics.Debug.WriteLine("\nType first word");
        //
        //     var word = System.Diagnostics.Debug.ReadLine();
        //     var synSetList = wordNet.GetSynSets(word);
        //
        //     if (synSetList.Count == 0) System.Diagnostics.Debug.WriteLine($"No SynSet found for '{word}'");
        //
        //     foreach (var synSet in synSetList)
        //     {
        //         var words = string.Join(", ", synSet.Words);
        //
        //         System.Diagnostics.Debug.WriteLine($"\nWords: {words}");
        //     }
        // }
    }
    catch (IOException e)
    {
        e.printStackTrace();
    }
    finally
    {
        // Close the NER and tokenizer model streams, best-effort.
        if (modelIn1 != null)
        {
            try
            {
                modelIn1.close();
            }
            catch (IOException e)
            {
            }
        }
        if (modelIn2 != null)
        {
            try
            {
                modelIn2.close();
            }
            catch (IOException e)
            {
            }
        }
        // truncateLists(ref nounPhrases);
        // truncateLists(ref nouns);
        // truncateLists(ref dates);
        // truncateLists(ref verbPhrases);
    }
    return(matchedWords);
}
// Creates a tokenizer backed by the supplied, already-loaded maxent model.
public Tokenizer(TokenizerModel model) => tokenizer = new TokenizerME(model);