Пример #1
0
        /// <summary>
        /// Runs the Stanford 3-class NER model over every file in <c>Texts</c> and prints the tagged
        /// result followed by the reference markup read from the file at the same position in <c>MarkedTexts</c>.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var jarRoot = @"stanford-ner-2016-10-31";
            var classifiersDirectory = jarRoot + @"\classifiers";

            var classifier = CRFClassifier.getClassifierNoExceptions(
                classifiersDirectory + @"\english.all.3class.distsim.crf.ser.gz");

            var rawFileNames    = Directory.GetFiles(@"Texts");
            var markedFileNames = Directory.GetFiles(@"MarkedTexts");

            // Directory.GetFiles does not guarantee an ordering; sort both lists so that
            // position i refers to the same pair of files on every run.
            Array.Sort(rawFileNames, StringComparer.Ordinal);
            Array.Sort(markedFileNames, StringComparer.Ordinal);

            if (rawFileNames.Length != markedFileNames.Length)
            {
                Console.Error.WriteLine(
                    "Warning: " + rawFileNames.Length + " raw file(s) but " + markedFileNames.Length +
                    " marked file(s); only the common prefix is compared.");
            }

            // Never index past the shorter of the two lists.
            int pairCount = Math.Min(rawFileNames.Length, markedFileNames.Length);

            for (int i = 0; i < pairCount; ++i)
            {
                string rawText         = File.ReadAllText(rawFileNames[i]);
                string rightMarkedText = File.ReadAllText(markedFileNames[i]);

                var markedText = classifier.classifyWithInlineXML(rawText);

                Console.WriteLine($"File Name: {Path.GetFileName(rawFileNames[i])}\n");
                Console.WriteLine($"{markedText}\n\n");

                Console.WriteLine($"{rightMarkedText}\n");
            }
        }
Пример #2
0
        /// <summary>
        /// Tags <paramref name="S"/> with the Stanford 3-class NER model and returns every token
        /// whose tag is not <c>/O</c>, one <c>word/TAG</c> token per line.
        /// </summary>
        /// <param name="S">Text to analyse. Leading/trailing commas and periods are trimmed and all remaining commas removed before tagging.</param>
        /// <returns>The non-<c>/O</c> tokens, each followed by <c>\n</c>; an empty string when there are none.</returns>
        public string getNER(string S)
        {
            CRFClassifier Classifier = CRFClassifier.getClassifierNoExceptions(@"C:\english.all.3class.distsim.crf.ser.gz");

            string cleaned  = S.Trim(new Char[] { ',', '.' }).Replace(@",", "");
            String classify = Classifier.classifyToString(cleaned);

            // RemoveEmptyEntries: an empty token (produced by consecutive or trailing spaces) does not
            // end with "/O" and would otherwise be emitted as a blank line in the result.
            string[] words = classify.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            var result = new System.Text.StringBuilder();

            foreach (string s in words)
            {
                if (!s.EndsWith("/O", StringComparison.Ordinal))
                {
                    result.Append(s).Append("\n");
                }
            }

            return result.ToString();
        }
Пример #3
0
        /// <summary>
        /// Classifies the contents of the <c>SampleText.txt</c> data file with the 3-class model and
        /// writes every word together with its answer annotation to the test output, asserting that
        /// no intermediate result is null.
        /// </summary>
        public void ExtractNeFromFile()
        {
            var modelPath = Files.NER.Classifier("english.all.3class.distsim.crf.ser.gz");
            var ner       = CRFClassifier.getClassifierNoExceptions(modelPath);
            var text      = System.IO.File.ReadAllText(Files.DataFile("SampleText.txt"));

            var classified = ner.classify(text).toArray();
            Assert.NotNull(classified);

            // Class object used as the lookup key for each label's answer annotation
            var answerKey = new CoreAnnotations.AnswerAnnotation().getClass();

            foreach (java.util.List tokenList in classified)
            {
                var labels = tokenList.toArray();
                Assert.NotNull(labels);

                for (var i = 0; i < labels.Length; i++)
                {
                    var label = (CoreLabel)labels[i];
                    var tag   = label.get(answerKey);
                    Assert.NotNull(tag);

                    TestContext.Out.WriteLine($"{label.word()}/{tag}");
                }

                // Blank line between sentences
                TestContext.Out.WriteLine();
            }
        }
        /// <summary>
        /// Finds the first word of <c>answer</c> that the Stanford 3-class NER model tags as <c>LOCATION</c>.
        /// </summary>
        /// <returns>
        /// The word of the first token tagged <c>LOCATION</c>, or <c>null</c> when <c>answer</c> is
        /// empty or contains no such token.
        /// </returns>
        public string Location()
        {
            // Where the NLP model is stored
            var source = @"C:\Users\chris\Downloads\stanford-ner-2018-02-27\stanford-ner-2018-02-27\classifiers\english.all.3class.distsim.crf.ser.gz";

            // Tag used to identify place names (e.g. Nicosia, Limassol)
            const string location = "LOCATION";

            if (string.IsNullOrWhiteSpace(answer))
            {
                return(null);
            }

            var classifier = CRFClassifier.getClassifierNoExceptions(source);

            // Split on any whitespace and drop empty entries so every element is a "word/TAG" token.
            string[] taggedTokens = classifier.classifyToString(answer)
                                              .Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

            // The previous loop never advanced its index and returned on the first iteration in both
            // branches, so the first word was always returned whatever its tag. Scan all tokens instead.
            foreach (string tagged in taggedTokens)
            {
                int slash = tagged.LastIndexOf('/');
                if (slash <= 0)
                {
                    continue;
                }

                if (tagged.Substring(slash + 1) == location)
                {
                    Console.WriteLine(tagged);

                    // Take the word from the classifier's own token rather than from a parallel
                    // space-split of the input: the two sequences are not guaranteed to line up.
                    return(tagged.Substring(0, slash));
                }
            }

            return(null);
        }
Пример #5
0
 /// <summary>
 /// Returns the classifier cached for <paramref name="lang"/>, loading it and adding it to
 /// <c>classifiers</c> on first request.
 /// </summary>
 /// <param name="lang">Language key passed to <c>StanfordEnv.GetNerLanguageFiles</c> and used as the cache key.</param>
 /// <returns>The cached or newly loaded classifier.</returns>
 public static CRFClassifier GetClassifierByLang(string lang)
 {
     CRFClassifier cached;

     // Single lookup on the hit path instead of ContainsKey followed by the indexer.
     if (!classifiers.TryGetValue(lang, out cached))
     {
         cached = CRFClassifier.getClassifierNoExceptions(classifiersDirectory + StanfordEnv.GetNerLanguageFiles(lang));
         classifiers.Add(lang, cached);
     }
     return(cached);
 }
        /// <summary>
        /// Creates the Tesseract OCR engine and loads the CRF classifier (via <c>Task.Run</c>),
        /// showing the loading indicator while doing so and enabling the Run button on success.
        /// </summary>
        /// <returns>A task that completes when both components are loaded.</returns>
        private async Task Load3rdParty()
        {
            Loading.Visibility = Visibility.Visible;
            try
            {
                _ocr        = new TesseractEngine("./tessdata", "eng", EngineMode.Default);
                _classifier = await Task.Run(() => CRFClassifier.getClassifierNoExceptions(@"english.all.3class.distsim.crf.ser.gz"));

                RunButton.Content   = "Run";
                RunButton.IsEnabled = true;
            }
            finally
            {
                // Hide the indicator even when loading throws, so the UI is not left in a
                // permanent "loading" state; the Run button stays disabled in that case.
                Loading.Visibility = Visibility.Collapsed;
            }
        }
Пример #7
0
 /// <summary>
 /// Creates a <see cref="Preprocessor" /> with empty token and candidate lists, then loads the
 /// NER classifier from <c>nerModelPath</c> and the POS tagger from <c>posModelPath</c>.
 /// </summary>
 public Preprocessor()
 {
     // Token buffer for the most recently tokenized article
     this.listLatestTokenizedArticle = new List <Token>();

     // Candidate lists holding single candidates
     this.listWhoCandidates   = new List <Candidate>();
     this.listWhenCandidates  = new List <Candidate>();
     this.listWhereCandidates = new List <Candidate>();

     // Candidate lists holding token sequences
     this.listWhatCandidates = new List <List <Token> >();
     this.listWhyCandidates  = new List <List <Token> >();

     // Models: NER first, then POS
     this.nerClassifier = CRFClassifier.getClassifierNoExceptions(nerModelPath);
     this.posTagger     = new MaxentTagger(posModelPath);
 }
Пример #8
0
 /// <summary>
 /// Loads the English 3-class CRF model from the hard-coded classifiers folder into
 /// <c>Classifier</c>. Any exception raised while loading is written to the console and not rethrown.
 /// </summary>
 public NER()
 {
     const string root      = @"D:\Temp\NER\classifiers";
     const string modelFile = @"\english.all.3class.distsim.crf.ser.gz";

     try
     {
         Classifier = CRFClassifier.getClassifierNoExceptions(root + modelFile);
     }
     catch (Exception loadError)
     {
         // Best effort: report the failure and leave Classifier unassigned
         Console.WriteLine(loadError.ToString());
     }
 }
Пример #9
0
        /// <summary>
        /// Click handler: runs the 3-class NER model over the <c>TNote</c> column of every row in
        /// <c>articles</c> and inserts each completed PERSON span into <c>AllNames(PName)</c>.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Mouse event data (not used).</param>
        private void Rectangle_MouseLeftButtonDown_2(object sender, MouseButtonEventArgs e)
        {
            var jarRoot = @"D:\stanford-ner-2018-10-16";
            var classifiersDirectory = jarRoot + @"\classifiers";

            // Loading 3 class classifier model
            var classifier = CRFClassifier.getClassifierNoExceptions(
                classifiersDirectory + @"\english.all.3class.distsim.crf.ser.gz");

            conn.Open();
            try
            {
                cmd    = new OleDbCommand("SELECT * From articles", conn);
                reader = cmd.ExecuteReader();
                try
                {
                    while (reader.Read())
                    {
                        var note   = reader["TNote"].ToString();
                        var tagged = classifier.classifyWithInlineXML(note);

                        // Splitting on '<' and '>' turns "<PERSON>" into the word "PERSON" and
                        // "</PERSON>" into "/PERSON", which act as open/close markers below.
                        string[] words = tagged.Split(' ', ',', '<', '>');

                        var    persons = new List <string>();
                        string current = null;   // name being accumulated; null while outside a PERSON span

                        foreach (var word in words)
                        {
                            if (word == "/PERSON")
                            {
                                // Only completed spans are stored, so a stray closing marker
                                // cannot make the insert loop index past the list.
                                if (current != null)
                                {
                                    persons.Add(current);
                                    current = null;
                                }
                            }
                            else if (word == "PERSON")
                            {
                                current = "";
                            }
                            else if (current != null)
                            {
                                current = current + word + " ";
                            }
                        }

                        foreach (var name in persons)
                        {
                            using (var insert = new OleDbCommand("Insert Into AllNames(PName) Values(@PN)", conn))
                            {
                                insert.Parameters.AddWithValue("PN", name);
                                insert.ExecuteNonQuery();
                            }
                        }
                    }
                }
                finally
                {
                    reader.Close();
                }
            }
            finally
            {
                // Release the connection even when classification or a database call throws.
                conn.Close();
            }
        }
Пример #10
0
        /// <summary>
        /// Trains a model from <c>train.prop</c>, writes it to <c>ner-model.ser.gz</c>, reloads it
        /// and tags a few sample phrases.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            const string propertiesFile = @"..\..\train.prop";
            const string modelFile      = @"..\..\ner-model.ser.gz";

            TrainAndWrite(propertiesFile, modelFile);

            // Reload the model that was just written
            var tagger = CRFClassifier.getClassifierNoExceptions(modelFile);

            var samples = new[] { "apple watch", "samsung mobile phones", " lcd 52 inch tv" };

            for (int i = 0; i < samples.Length; i++)
            {
                DoTagging(tagger, samples[i]);
            }
        }
Пример #11
0
        /// <summary>
        /// Tags <paramref name="story"/> with the 3-class NER model located relative to the
        /// application base directory.
        /// </summary>
        /// <param name="story">Text to classify.</param>
        /// <returns>The text with inline XML entity tags, as produced by <c>classifyWithInlineXML</c>.</returns>
        private static string GetNLPResults(string story)
        {
            // Folder holding the classifier models, resolved from the application base directory
            string modelsFolder = AppDomain.CurrentDomain.BaseDirectory + @"..\DirectSupply.Anonymize.Service\Models\NLP";
            string modelPath    = modelsFolder + @"\english.all.3class.distsim.crf.ser.gz";

            CRFClassifier ner = CRFClassifier.getClassifierNoExceptions(modelPath);

            return(ner.classifyWithInlineXML(story));
        }
Пример #12
0
        /// <summary>
        /// Loads the 3-class NER model, builds the LOCATION-element regex, parses the artist scraping
        /// configuration, and creates and loads the three hosted feature tables.
        /// </summary>
        /// <returns>A task that completes once all three feature tables have been loaded.</returns>
        private async Task Init()
        {
            // NER model
            var nerHome   = @"C:\stanford-ner-2018-10-16";
            var modelsDir = nerHome + @"\classifiers";

            _classifier = CRFClassifier.getClassifierNoExceptions(
                modelsDir + @"\english.all.3class.distsim.crf.ser.gz");

            // Matches a LOCATION element and captures its content
            var rxOptions = RegexOptions.Compiled | RegexOptions.IgnoreCase;
            _locationRx = new Regex(@"<LOCATION\b[^>]*>(.*?)</LOCATION>", rxOptions);

            // XPath selectors used to scrape artist and listener information
            var configArtistInfoJson = @"
            {
                'artist': '//h1[contains(@class, \'view-header\')]',
                'about': '//div[contains(@class, \'bio-primary\')]',
                'more': '//div[contains(@class, \'bio-secondary\')]',
                'listeners-city': '//span[contains(@class, \'horizontal-list__item__title\')]',
                'listeners': '//span[contains(@class, \'horizontal-list__item__subtitle\')]'
            }";

            ConfigSection artistConfig = StructuredDataConfig.ParseJsonString(configArtistInfoJson);
            _artistScraping = new StructuredDataExtractor(artistConfig);

            // Portal items for the hosted feature layers, fetched one after another
            ArcGISPortal arcPortal = await ArcGISPortal.CreateAsync();

            PortalItem hometownItem    = await PortalItem.CreateAsync(arcPortal, _hometownLayerId);
            PortalItem otherPointsItem = await PortalItem.CreateAsync(arcPortal, _otherPointsLayerId);
            PortalItem listenerItem    = await PortalItem.CreateAsync(arcPortal, _listenerLayerId);

            // Feature tables are created first, then loaded sequentially
            _hometownTable    = new ServiceFeatureTable(hometownItem, 0);
            _otherPointsTable = new ServiceFeatureTable(otherPointsItem, 0);
            _listenerTable    = new ServiceFeatureTable(listenerItem, 0);

            await _hometownTable.LoadAsync();
            await _otherPointsTable.LoadAsync();
            await _listenerTable.LoadAsync();
        }
Пример #13
0
        /// <summary>
        /// Loads the model at <c>classifiersDirecrory + model</c> and classifies
        /// <paramref name="sentence"/> with spacing preserved.
        /// </summary>
        /// <param name="sentence">Text to classify.</param>
        /// <param name="outputFormat">Output format name handed to the classifier; defaults to <c>slashTags</c>.</param>
        /// <returns>The classifier's string output for the sentence.</returns>
        public string classifyToString(string sentence, string outputFormat = "slashTags")
        {
            var modelPath = classifiersDirecrory + model;
            var crf       = CRFClassifier.getClassifierNoExceptions(modelPath);

            // Third argument: preserve the original spacing of the input
            var tagged = crf.classifyToString(sentence, outputFormat, true);

            return(tagged);
        }
Пример #14
0
        /// <summary>
        /// Demonstrates the 3-class NER model on two sample sentences using slash-tag, inline-XML
        /// and XML output.
        /// </summary>
        static void Main()
        {
            // Folder of the Stanford NER distribution that contains the classifier models
            const string nerHome = @"..\..\..\..\data\paket-files\nlp.stanford.edu\stanford-ner-2018-02-27";
            string modelsDir     = nerHome + @"\classifiers";
            string modelPath     = modelsDir + @"\english.all.3class.distsim.crf.ser.gz";

            var ner = CRFClassifier.getClassifierNoExceptions(modelPath);

            // Sentence 1: default string output
            var greeting = "Good afternoon Rajat Raina, how are you today?";
            Console.WriteLine("{0}\n", ner.classifyToString(greeting));

            // Sentence 2: inline XML, then "xml" format with spacing preserved
            var statement = "I go to school at Stanford University, which is located in California.";
            Console.WriteLine("{0}\n", ner.classifyWithInlineXML(statement));
            Console.WriteLine("{0}\n", ner.classifyToString(statement, "xml", true));
        }
Пример #15
0
        /// <summary>
        /// Classifies <c>demo.txt</c> with the 7-class model, deserializes the XML output into a
        /// <c>WiCollection</c> and prints the recognised words grouped by <c>Entity</c> value.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var workingDir = Environment.CurrentDirectory;
            var modelsDir  = workingDir + @"\stanford-ner-2016-10-31\classifiers";

            // 7-class model
            var ner = CRFClassifier.getClassifierNoExceptions(Path.Combine(modelsDir, "english.muc.7class.distsim.crf.ser.gz"));

            // Document whose named entities are to be recognised
            var text = File.ReadAllText(Path.Combine(Environment.CurrentDirectory, "demo.txt"));

            // "xml" output format, spacing preserved
            string taggedXml = ner.classifyToString(text, "xml", true);

            // Wrap the output in parent/WiCollection elements so it can be deserialized
            var wrapped = $"<WiCollection><parent>{taggedXml}</parent></WiCollection>";

            WiCollection collection;
            var deserializer = new XmlSerializer(typeof(WiCollection));

            using (TextReader source = new StringReader(wrapped))
            {
                collection = (WiCollection)deserializer.Deserialize(source);
            }

            // One section per Entity enum value
            foreach (Entity kind in Enum.GetValues(typeof(Entity)))
            {
                Console.WriteLine($"-----------{kind}-----------");

                var wanted  = kind.ToString().ToLowerInvariant();
                var matches = collection.Wi.Where(wi => wi.Entity.ToLowerInvariant() == wanted)
                                           .Select(wi => wi.Text);

                Console.WriteLine(string.Join(Environment.NewLine, matches));
            }

            Console.Read();
        }
Пример #16
0
        /// <summary>
        /// Classifies two fixed sentences with the 3-class model and writes the slash-tag,
        /// inline-XML, XML and per-sentence label output to the test output.
        /// </summary>
        public void ExtractNeFromPredefinedPhrase()
        {
            var modelPath = Files.NER.Classifier("english.all.3class.distsim.crf.ser.gz");
            var ner       = CRFClassifier.getClassifierNoExceptions(modelPath);

            var greeting  = "Good afternoon Rajat Raina, how are you today?";
            var statement = "I go to school at Stanford University, which is located in California.";

            var output = TestContext.Out;
            output.WriteLine(ner.classifyToString(greeting));
            output.WriteLine(ner.classifyWithInlineXML(statement));
            output.WriteLine(ner.classifyToString(statement, "xml", true));

            var classified = ner.classify(statement).toArray();
            Assert.NotNull(classified);

            // Print each element prefixed by its position
            var position = 0;
            foreach (var item in classified)
            {
                TestContext.Out.WriteLine($"{position}\n:{item}");
                position++;
            }
        }
        /// <summary>
        /// Title-cases <paramref name="message"/> (en-US rules), tags it with the 3-class NER model
        /// and converts the inline-XML result into an entity list via <c>getEntityList</c>.
        /// </summary>
        /// <param name="message">Text to analyse.</param>
        /// <returns>The entities returned by <c>getEntityList</c> for the tagged text.</returns>
        public List <EntityRecognition> ExtractionChannel(string message)
        {
            // Model shipped under the application's Content folder
            string modelPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Content\\classifier\\english.all.3class.distsim.crf.ser.gz");
            var    ner       = CRFClassifier.getClassifierNoExceptions(modelPath);

            // The message is title-cased before it is classified
            TextInfo enUsText   = new CultureInfo("en-US", false).TextInfo;
            string   titleCased = enUsText.ToTitleCase(message);

            string taggedXml = ner.classifyWithInlineXML(titleCased);

            return(getEntityList(taggedXml));
        }
Пример #18
0
        /// <summary>
        /// Tags <paramref name="text"/> with the 7-class model and returns the inner XML of every
        /// non-empty <c>LOCATION</c> element in the result.
        /// </summary>
        /// <param name="text">Text to analyse.</param>
        /// <returns>The location strings in document order; empty when none were found.</returns>
        public string[] GetPlaces(string text)
        {
            var modelsDir = Environment.CurrentDirectory + @"\stanford-ner-2016-10-31\classifiers";
            var modelPath = Path.Combine(modelsDir, "english.muc.7class.distsim.crf.ser.gz");
            var ner       = CRFClassifier.getClassifierNoExceptions(modelPath);
            var inlineXml = ner.classifyWithInlineXML(text);

            // Wrap the tagged text in a single root element so it can be loaded as a document
            var document = new XmlDocument();
            document.LoadXml($"<xml>{inlineXml}</xml>");

            var found = new List <string>();

            foreach (XmlNode node in document.GetElementsByTagName("LOCATION"))
            {
                string inner = node.InnerXml;
                if (inner.Length > 0)
                {
                    found.Add(inner);
                }
            }

            return(found.ToArray());
        }
Пример #19
0
        /// <summary>
        /// Returns the shared 7-class classifier, loading it from the <c>Models</c> folder under the
        /// executable directory on first use and caching it in <c>_classifier</c>.
        /// </summary>
        /// <returns>The cached or newly loaded classifier.</returns>
        /// <exception cref="DirectoryNotFoundException">The <c>Models</c> directory does not exist.</exception>
        /// <exception cref="FileNotFoundException">The model file does not exist in that directory.</exception>
        public static CRFClassifier GetClassifier()
        {
            if (_classifier != null)
            {
                return(_classifier);
            }

            var modelDir = new DirectoryInfo(Path.Combine(GetExecDirLocation(), "Models"));

            // Specific exception types instead of bare Exception; both still derive from
            // Exception, so existing catch (Exception) handlers keep working.
            if (!modelDir.Exists)
            {
                throw new DirectoryNotFoundException(string.Format("The required NER model directory {0} is missing", modelDir.FullName));
            }
            var modelFile = new FileInfo(Path.Combine(modelDir.FullName, "english.muc.7class.distsim.crf.ser.gz"));

            if (!modelFile.Exists)
            {
                // The full path is carried in FileNotFoundException.FileName
                throw new FileNotFoundException(string.Format("The required NER model file {0} is missing", modelFile.Name), modelFile.FullName);
            }

            _classifier = CRFClassifier.getClassifierNoExceptions(modelFile.FullName);
            return(_classifier);
        }
Пример #20
0
        /// <summary>
        /// Parses a fixed HTML corpus file, tags its cleared text with the 3-class model, writes the
        /// result to <c>stanford.txt</c> and prints the elapsed time of the tag-and-write step.
        /// </summary>
        static void Main()
        {
            // Folder of the Stanford NER distribution that contains the classifier models
            const string nerHome = @"E:\SystemsTests\stanford\stanford-ner-2015-04-20";
            string modelsDir     = nerHome + @"\classifiers";

            var ner = CRFClassifier.getClassifierNoExceptions(
                modelsDir + @"\english.all.3class.distsim.crf.ser.gz");

            var corpus = System.IO.File.ReadAllText(
                @"C:\code\github\LingProductsTests\TestResults\strict\eng\2talk\testmodeleng_etalon.txt");

            var parsed  = new HtmlCorpusParser().Parse(corpus, 350);
            var cleared = parsed.ClearedText;

            // Only classification and the file write are timed
            var stopwatch = Stopwatch.StartNew();
            var tagged    = ner.classifyWithInlineXML(cleared);
            System.IO.File.WriteAllText("stanford.txt", tagged);
            stopwatch.Stop();

            Console.WriteLine("Elapsed {0} ms", stopwatch.ElapsedMilliseconds);
            Console.ReadLine();
        }
Пример #21
0
        /// <summary>
        /// Tags <paramref name="value"/> with the 3-class model and lists the recognised words.
        /// </summary>
        /// <param name="value">Text to analyse.</param>
        /// <returns>
        /// The inner text of every <c>wi</c> element tagged ORGANIZATION, then PERSON, then LOCATION,
        /// each followed by <c>", "</c>; an empty string when nothing was recognised.
        /// </returns>
        public string GetData(string value)
        {
            var jarRoot2             = @"C:\Workspace\Bofa\NLP\stanford-ner-2015-04-20";
            var classifiersDirectory = jarRoot2 + @"\classifiers";

            // Loading 3 class classifier model
            var classifier = CRFClassifier.getClassifierNoExceptions(
                classifiersDirectory + @"\english.all.3class.distsim.crf.ser.gz");

            // Wrap the classifier's "xml" output in a single root element so it loads as a document
            XmlDocument doc = new XmlDocument();
            doc.LoadXml(string.Format("<root>{0}</root>", classifier.classifyToString(value, "xml", true)));

            // Output order is fixed: organizations, persons, locations
            string[]      entityTypes = { "ORGANIZATION", "PERSON", "LOCATION" };
            StringBuilder sd          = new StringBuilder();

            foreach (string entityType in entityTypes)
            {
                foreach (XmlNode xmlN in doc.SelectNodes("root/wi[@entity='" + entityType + "']"))
                {
                    sd.Append(xmlN.InnerText).Append(", ");
                }
            }
            return(sd.ToString());
        }
Пример #22
0
        /// <summary>
        /// Loads the 3-class NER classifier into <c>classifier</c> and builds the CoreNLP annotation
        /// pipeline into <c>pipeline</c>, then prints how long the setup took.
        /// </summary>
        static void setupPipeline()
        {
            DateTimeOffset started = DateTimeOffset.UtcNow;
            var            classifiersDirectory = @"C:/temp/stanford-english-corenlp/edu/stanford/nlp/models/ner";

            // Loading 3 class classifier model
            classifier = CRFClassifier.getClassifierNoExceptions(classifiersDirectory + @"\english.all.3class.distsim.crf.ser.gz");

            // Path to the folder with models extracted from `stanford-corenlp-3.8.0-models.jar`
            var jarRoot = @"c:/temp/stanford-english-corenlp/";

            // Annotation pipeline configuration
            var props = new Properties();

            props.setProperty("annotators", "tokenize, ssplit, pos, parse, sentiment");
            props.setProperty("ner.useSUTime", "0");

            // The pipeline is constructed with the models folder as the current directory.
            var curDir = Environment.CurrentDirectory;

            Directory.SetCurrentDirectory(jarRoot);
            try
            {
                pipeline = new StanfordCoreNLP(props);
            }
            finally
            {
                // Restore the process-wide working directory even when pipeline construction throws.
                Directory.SetCurrentDirectory(curDir);
            }

            Console.WriteLine("finished up : " + (DateTimeOffset.UtcNow - started).TotalSeconds + " seconds");
        }
Пример #23
0
 /// <summary>
 /// Loads the classifier whose path is the concatenation of the configured jar root,
 /// classifiers directory and 3-class model name.
 /// </summary>
 /// <returns>A newly loaded classifier.</returns>
 private CRFClassifier GetNerClassifier()
 {
     var modelPath = nlpConfig.JarRoot + nlpConfig.ClassifiersDirectory + nlpConfig.Ner3ClassesModel;

     return(CRFClassifier.getClassifierNoExceptions(modelPath));
 }
        /// <summary>
        /// Reads the Turtle graph at <c>inputPath</c>, runs the 3-class NER model over the text of
        /// each distinct "isString" object, prints the distinct locations, organisations and persons
        /// found in every sentence and passes each of them to the dbPedia checker.
        /// </summary>
        public void ProcessInputData()
        {
            Console.WriteLine("Processing input file...");

            dbPediaAccess dbPediaChecker = new dbPediaAccess();

            var    jarRoot = @"C:\Users\micha\source\repos\ConsoleApp2\ConsoleApp2\data\paket-files\nlp.stanford.edu\stanford-ner-2016-10-31";
            var    classifiersDirectory = jarRoot + @"\classifiers";
            var    classifier           = CRFClassifier.getClassifierNoExceptions(classifiersDirectory + @"\english.all.3class.distsim.crf.ser.gz");
            IGraph g = new Graph();

            new TurtleParser().Load(g, inputPath);

            // One triple per distinct object among those whose predicate mentions "isString"
            var triples = g.Triples.Where(q => q.Predicate.ToString().ToLower().Contains("isString".ToLower())).GroupBy(q => q.Object).Select(q => q.First());

            //https://csharp.hotexamples.com/examples/VDS.RDF.Parsing/TurtleParser/-/php-turtleparser-class-examples.html
            // The pattern is loop-invariant, so the Regex is built once.
            var charRange = new Regex("char=([0-9]+),([0-9]+)");
            var dataSet   = new List <data.Input>();

            foreach (var t in triples)
            {
                var match = charRange.Match(t.Subject.ToString());
                if (!match.Success)
                {
                    continue;
                }

                string objectText = t.Object.ToString();

                // Keep the first <end offset> characters, clamped so that an offset larger than
                // the text cannot make Substring throw.
                int length = Math.Min(int.Parse(match.Groups[2].Value), objectText.Length);

                dataSet.Add(new data.Input(objectText.Substring(0, length)));
            }

            //http://sergey-tihon.github.io/Stanford.NLP.NET/StanfordNER.html

            Console.WriteLine();

            foreach (var input in dataSet)
            {
                string classifierOutput = classifier.classifyWithInlineXML(input.sentence);
                Console.WriteLine(input.sentence + "\n");

                // NOTE(review): the three entity lists below are presumably filled by
                // EntityExtractor - confirm against its definition.
                EntityExtractor(classifierOutput, "LOCATION");
                EntityExtractor(classifierOutput, "ORGANIZATION");
                EntityExtractor(classifierOutput, "PERSON");

                // Enumerable.Distinct() returns a new sequence and leaves the list untouched; the
                // result used to be discarded, so duplicates were still printed and looked up.
                if (locationEntityL.Count > 0)
                {
                    Console.WriteLine("Locations:");
                    foreach (var location in locationEntityL.Distinct().ToList())
                    {
                        Console.WriteLine(location);
                        dbPediaChecker.checkdBPedia("Place", location);
                    }
                    Console.WriteLine();
                    locationEntityL.Clear();
                }

                if (organizationEntityL.Count > 0)
                {
                    Console.WriteLine("Organisations:");
                    foreach (var organization in organizationEntityL.Distinct().ToList())
                    {
                        Console.WriteLine(organization);
                        dbPediaChecker.checkdBPedia("Organisation", organization);
                    }
                    Console.WriteLine();
                    organizationEntityL.Clear();
                }

                if (personEntityL.Count > 0)
                {
                    Console.WriteLine("Persons:");
                    foreach (var person in personEntityL.Distinct().ToList())
                    {
                        Console.WriteLine(person);
                        dbPediaChecker.checkdBPedia("Person", person);
                    }
                    Console.WriteLine();
                    personEntityL.Clear();
                }
            }

            // Keep the console window open until a key is pressed
            Console.ReadKey();
        }
Пример #25
0
        /// <summary>
        /// Writes the set of named entities (PERSON, ORGANIZATION, LOCATION) of each tweet cluster.
        /// Reads <c>signalCluster.txt</c> and <c>generalCluster.txt</c> in lock-step; every iteration
        /// consumes three lines from each file, of which only the second (space-separated document
        /// ids) is used. Output: <c>clusterNameEntitySet.txt</c>, one line per cluster with the
        /// entities separated by <c>"; "</c>.
        /// </summary>
        /// <param name="fileName">Lucene index folder path of tweets</param>
        public static void nameEntitySet(string fileName)
        {
            var indexReader = LuceneOperations.GetIndexReader(fileName);

            // using blocks close all four streams even when parsing or classification throws
            using (var sr  = new StreamReader("signalCluster.txt", Encoding.Default))
            using (var sr1 = new StreamReader("generalCluster.txt", Encoding.Default))
            using (var fs  = new FileStream("clusterNameEntitySet.txt", FileMode.Create))
            using (var sw  = new StreamWriter(fs, Encoding.Default))
            {
                // Path to the folder with classifiers models
                var jarRoot = @"..\..\..\..\stanford-ner-2015-12-09";
                var classifiersDirectory = jarRoot + @"\classifiers";

                // Loading 3 class classifier model
                var classifier = CRFClassifier.getClassifierNoExceptions(
                    classifiersDirectory + @"\english.all.3class.distsim.crf.ser.gz");

                // The line read in the loop condition is only used as an end-of-file check.
                while (sr.ReadLine() != null && sr1.ReadLine() != null)
                {
                    string line  = sr.ReadLine();
                    string line1 = sr1.ReadLine();
                    sr.ReadLine();
                    sr1.ReadLine();

                    // A truncated final record has no id line; stop instead of failing on null.
                    if (line == null || line1 == null)
                    {
                        break;
                    }

                    List <int> iDocList  = ParseClusterDocIds(line);
                    List <int> iDocList1 = ParseClusterDocIds(line1);

                    HashSet <string> entities = new HashSet <string>();

                    // Signal-cluster documents first, then general-cluster documents
                    foreach (List <int> docIds in new[] { iDocList, iDocList1 })
                    {
                        foreach (int docId in docIds)
                        {
                            Document inDoc = indexReader.Document(docId);
                            string   text  = inDoc.Get("Text");

                            // Collapse whitespace and strip #n#/#N# markers, '#' and '@' before tagging
                            text = Regex.Replace(text, @"\s+", " ");
                            text = Regex.Replace(text, @"#n#|#N#", "");
                            text = Regex.Replace(text, @"#", "");
                            text = Regex.Replace(text, @"@", "");
                            text = classifier.classifyWithInlineXML(text);

                            AddTaggedEntities(text, "PERSON", entities);
                            AddTaggedEntities(text, "ORGANIZATION", entities);
                            AddTaggedEntities(text, "LOCATION", entities);
                        }
                    }

                    foreach (string entity in entities)
                    {
                        sw.Write(entity + "; ");
                    }

                    sw.WriteLine();
                }
            }
        }

        // Parses the space-separated document ids of a cluster line. The final split element is
        // skipped, exactly as the inline loops this helper replaces did.
        private static List <int> ParseClusterDocIds(string line)
        {
            string[]   parts = Regex.Split(line, " ");
            List <int> ids   = new List <int>();

            for (int i = 0; i < parts.Length - 1; i++)
            {
                ids.Add(int.Parse(parts[i]));
            }

            return(ids);
        }

        // Adds the inner text of every <tag>...</tag> span found in taggedText to entities.
        private static void AddTaggedEntities(string taggedText, string tag, HashSet <string> entities)
        {
            // The capture group replaces the hard-coded Substring offsets per tag length.
            foreach (Match match in Regex.Matches(taggedText, "<" + tag + ">([^<>]+)</" + tag + ">"))
            {
                entities.Add(match.Groups[1].Value);
            }
        }
Пример #26
0
        /// <summary>
        /// Tags <paramref name="partial_address"/> with the CRF model loaded from
        /// <paramref name="inputPath"/> and appends the text of each recognised
        /// address component to the matching field of <paramref name="addr"/>.
        /// </summary>
        /// <param name="inputPath">Path of the serialized CRF model to load.</param>
        /// <param name="partial_address">Address text to classify.</param>
        /// <param name="addr">
        /// Address record that receives the results. Component text is appended to
        /// whatever the fields already hold, then each field is trimmed;
        /// <c>numbers</c> is overwritten with the whitespace-separated NUMBER tokens.
        /// </param>
        /// <remarks>
        /// The tagger's inline-XML output is wrapped in a <c>root</c> element and
        /// parsed as XML. If parsing fails, the error is written to the console and
        /// the method returns without touching <paramref name="addr"/>.
        /// </remarks>
        public void useModel(string inputPath, string partial_address, ref NERAddress addr)
        {
            CRFClassifier crf = CRFClassifier.getClassifierNoExceptions(inputPath);

            // Inline-XML output is used so the labels can be read back as element names.
            string taggedXml = crf.classifyWithInlineXML(partial_address);

            // An XML element name cannot start with a digit, so the "0" label is
            // renamed before the text is handed to the parser.
            taggedXml = taggedXml.Replace("<0>", "<ZERO>").Replace("</0>", "</ZERO>");

            XmlDocument parsed = new XmlDocument();

            try
            {
                parsed.LoadXml("<root>" + taggedXml + "</root>");
            }
            catch (XmlException parseError)
            {
                Console.WriteLine("Exception occurred while parsing xml: " + parseError.Message);
                return;
            }

            Console.WriteLine("Model output: " + taggedXml);

            string numberTokens = "";

            foreach (XmlNode child in parsed.DocumentElement.ChildNodes)
            {
                string tag = child.Name;

                // The label patterns below cannot match the same tag twice (each one
                // requires a different text right after the "I-"/"O-"/"B-" prefix),
                // so a single else-if chain visits exactly the branch the tag belongs to.
                if (tag == "NUMBER")
                {
                    numberTokens += child.InnerText + " ";
                }
                else if (Regex.IsMatch(tag, "^[IOB]-LOCALITY"))
                {
                    addr.locality += child.InnerText + " ";
                }
                else if (Regex.IsMatch(tag, "^[IOB]-SECONDARY_LOCALITY"))
                {
                    addr.secondary_locality += child.InnerText + " ";
                }
                else if (Regex.IsMatch(tag, "^[IOB]-THOROFARE"))
                {
                    addr.thorofare += child.InnerText + " ";
                }
                else if (Regex.IsMatch(tag, "^[IOB]-BUILDING_GROUP_NAME"))
                {
                    addr.building_group_name += child.InnerText + " ";
                }
                else if (Regex.IsMatch(tag, "^[IOB]-BUILDING_NAME"))
                {
                    addr.building_name += child.InnerText + " ";
                }
                else if (Regex.IsMatch(tag, "^[IOB]-SUB_BUILDING_NAME"))
                {
                    addr.sub_building_name += child.InnerText + " ";
                }
                else if (Regex.IsMatch(tag, "^[IOB]-BUILDING_NUMBER"))
                {
                    addr.building_number += child.InnerText + " ";
                }
                else if (Regex.IsMatch(tag, "^[IOB]-DEPARTMENT"))
                {
                    addr.department += child.InnerText + " ";
                }
                else if (Regex.IsMatch(tag, "^[IOB]-ORGANISATION_NAME"))
                {
                    addr.organisation_name += child.InnerText + " ";
                }
            }

            // Every NUMBER token was appended with a trailing space; split them back out.
            string[] spaceSeparator = { " " };
            addr.numbers = numberTokens.Split(spaceSeparator, StringSplitOptions.RemoveEmptyEntries);

            // Drop the trailing separator left behind by the appends above.
            addr.locality            = addr.locality.Trim();
            addr.secondary_locality  = addr.secondary_locality.Trim();
            addr.thorofare           = addr.thorofare.Trim();
            addr.building_group_name = addr.building_group_name.Trim();
            addr.building_name       = addr.building_name.Trim();
            addr.sub_building_name   = addr.sub_building_name.Trim();
            addr.building_number     = addr.building_number.Trim();
            addr.department          = addr.department.Trim();
            addr.organisation_name   = addr.organisation_name.Trim();
        }
Пример #27
0
        /// <summary>
        /// For every entry in <c>Files</c>, reads the text file from
        /// <paramref name="folderPath"/>, runs the NER classifier over it, resolves the
        /// tokens tagged as LOCATION against the rows of <c>/worldcities.csv</c>, and
        /// inserts one <c>Book</c> (built from the file's "Title:" / "Author:" lines and
        /// the matched cities) via <c>collection.InsertOne</c>.
        /// </summary>
        /// <param name="folderPath">Folder that contains the text files to process.</param>
        /// <returns>
        /// The local <c>books</c> list. Nothing in the visible code adds to it, so it is
        /// returned empty; the Book objects are only passed to <c>collection.InsertOne</c>.
        /// </returns>
        /// <remarks>
        /// NOTE(review): the statement that creates <c>client</c> is mangled in this copy
        /// (the connection string looks redacted and the text up to <c>\classifiers";</c>
        /// is missing). The declarations of <c>Files</c>, <c>collection</c>, <c>i</c> and
        /// <c>classifiersDirecrory</c> are therefore not visible here; presumably they
        /// were part of the lost text. Restore them from the original source before
        /// building.
        /// </remarks>
        public List <Book> getStreamFromTxtFile(String folderPath)
        {
            // fileContent, apts and d are not read anywhere in the visible body.
            String         fileContent = "";
            List <string>  apts        = new List <string>();
            DirectoryInfo  d           = new DirectoryInfo(folderPath);
            // Cities found for the file currently being processed; cleared after each file.
            HashSet <City> cities      = new HashSet <City>();
            List <Book>    books       = new List <Book>();
            // NOTE(review): this line is truncated/redacted in this copy and does not parse as written.
            var            client      = new MongoClient(new MongoUrl("mongodb://*****:*****@"\classifiers";

            // Loading 3 class classifier model
            var classifier = CRFClassifier.getClassifierNoExceptions(
                classifiersDirecrory + @"\english.all.3class.distsim.crf.ser.gz");

            // Whole city table is read once as a single string and re-split per lookup below.
            string worldcities = System.IO.File.ReadAllText("/worldcities.csv");



            foreach (var file in Files)
            {
                string content = System.IO.File.ReadAllText(folderPath + "//" + file);


                // classifyToString yields "token/LABEL" pairs; split the tagged text on
                // '.', ' ', ',', '-' and ':' and keep only the pieces carrying /LOCATION.
                var locations = classifier.classifyToString(content).Split('.', ' ', ',', '-', ':').Where(x => x.Contains("/LOCATION"));

                foreach (var item in locations)
                {
                    // The /LOCATION test repeats the filter applied above; the extra
                    // condition additionally rejects any piece that contains "/O".
                    if (item.Contains("/LOCATION") && !item.Contains("/O"))
                    {
                        // Strip the label to get the bare token, then pre-filter the CSV
                        // rows to those containing the token anywhere in the line.
                        var newItem       = item.Replace("/LOCATION", "");
                        var splittedArray = worldcities.Split('\n').Where(x => x.Contains(newItem.Trim())).ToArray();

                        if (splittedArray.Length > 0)
                        {
                            foreach (var row in splittedArray)
                            {
                                // Rows are ';'-separated. Column 2 is compared with the token
                                // and columns 3 and 4 are passed to the City constructor.
                                // NOTE(review): only Length > 2 is checked before reading
                                // indexes 3 and 4 — confirm every row has at least 5 columns.
                                var currentRow = row.Split(';').ToArray();
                                if (currentRow.Length > 2)
                                {
                                    if (currentRow[2].Trim() == newItem.Trim())
                                    {
                                        cities.Add(new City(newItem, currentRow[3], currentRow[4]));
                                        // Stop at the first row whose column 2 equals the token.
                                        break;
                                    }
                                }
                            }
                        }
                    }
                }

                // Lines of the file that contain "Title:" or "Author:", in file order.
                var vs = content.Split('\n').Where(x => x.Contains("Title:") || x.Contains("Author:")).ToArray();

                // Two matching lines: both are passed to Book. One matching line: it is
                // passed as the first argument with an empty second argument. Any other
                // count inserts nothing for this file.
                if (2 == vs.Length)
                {
                    collection.InsertOne(new Book(vs[0], vs[1], cities.ToList()));
                }
                else if (1 == vs.Length)
                {
                    collection.InsertOne(new Book(vs[0], "", cities.ToList()));
                }
                // Reset the per-file city set and print a running counter.
                cities.Clear();
                Console.WriteLine(i++);

                //       Console.ReadLine();
            }



            return(books);
        }
Пример #28
0
        /// <summary>
        /// Demo entry point. Tags a fixed sample sentence with the 7-class Stanford
        /// NER model, pulls the text found between each entity tag pair, builds a
        /// bag-of-words codebook from those snippets, and then repeatedly reads a
        /// question from the console and prints the decision of a pre-trained SVM
        /// ("BAYAR" for false, "PERGI" for true).
        /// </summary>
        /// <remarks>
        /// The SVM model file is read once, before the first prompt, instead of on
        /// every question. The question loop ends when standard input is closed
        /// (<c>Console.ReadLine</c> returns <c>null</c>).
        /// </remarks>
        static void Main()
        {
            // Path to the folder with classifiers models
            var jarRoot = @"\Users\devir\OneDrive\Documents\Visual Studio 2015\Projects\ner";
            var classifiersDirecrory = jarRoot + @"\classifiers";

            // Loading the 7 class (MUC) classifier model
            var classifier = CRFClassifier.getClassifierNoExceptions(
                classifiersDirecrory + @"\english.muc.7class.distsim.crf.ser.gz");

            var s1 = " She got up this morning at 9:00 am and went to a shop to spend five dollars to buy a 50% off toothbrush.";

            Console.WriteLine("{0}\n", classifier.classifyToCharacterOffsets(s1));

            // Classify once and reuse the result for both the printout and the extraction.
            string result = classifier.classifyWithInlineXML(s1);
            Console.WriteLine("{0}\n", result);

            // Extract the text between each entity tag pair, one entity type at a time.
            // NOTE(review): "Percent" and "Date" are mixed-case while the other tag
            // names are upper-case — confirm how GetStringInBetween matches case
            // against the tags the model actually emits.
            string[] tagNames = { "TIME", "LOCATION", "PERSON", "ORGANIZATION", "MONEY", "Percent", "Date" };
            string[] gabungan = new string[tagNames.Length];

            for (int k = 0; k < tagNames.Length; k++)
            {
                string[] found = GetStringInBetween("<" + tagNames[k] + ">", "</" + tagNames[k] + ">", result, false, false);

                // Element 0 is the value used here; the remaining elements are not needed.
                gabungan[k] = found[0];
            }

            // Bag of words over the extracted snippets
            string semua = string.Join(";", gabungan) + ";";

            Console.WriteLine(semua);

            foreach (var a in gabungan)
            {
                Console.WriteLine(a);
            }

            string[][] words = gabungan.Tokenize();

            var codebook = new BagOfWords()
            {
                MaximumOccurance = 1 // the resulting vector will have only 0's and 1's
            };

            codebook.Learn(words);

            // One vector per snippet. These vectors are computed but not consumed
            // anywhere later in this method.
            double[][] keseluruhanBOW1 = new double[words.Length][];

            for (int k = 0; k < words.Length; k++)
            {
                keseluruhanBOW1[k] = codebook.Transform(words[k]);
            }

            // The trained model never changes while the program runs, so load it
            // and build the machine once instead of once per question.
            string path = @"C:\Users\devir\OneDrive\Documents\Visual Studio 2015\Projects\ner";
            LibSvmModel model = LibSvmModel.Load(Path.Combine(path, "pelatihanSVMbayardanpergi.txt"));
            SupportVectorMachine svm = model.CreateMachine();

            while (true)
            {
                Console.Write("Enter question: ");
                string val = Console.ReadLine();

                // ReadLine returns null once standard input is closed; stop cleanly
                // instead of passing null on to the tokenizer.
                if (val == null)
                {
                    break;
                }

                string[] textss = { val };
                string[][] wordss = textss.Tokenize();

                // A fresh codebook is learned from the question alone, as before.
                var codebook2 = new BagOfWords()
                {
                    MaximumOccurance = 1 // the resulting vector will have only 0's and 1's
                };
                codebook2.Learn(wordss);
                double[] c1 = codebook2.Transform(wordss[0]);

                bool predicted = svm.Decide(c1);

                Console.WriteLine(predicted ? "PERGI" : "BAYAR");

                // Wait for Enter before prompting again.
                Console.ReadLine();
            }
        }
Пример #29
0
        /// <summary>
        /// Runs the 7-class Stanford NER model over every string in
        /// <paramref name="content"/> and collects the recognised entities.
        /// </summary>
        /// <param name="content">Text fragments to tag; they are tagged one by one, in order.</param>
        /// <returns>
        /// A dictionary mapping entity text to its label (ORGANIZATION, LOCATION,
        /// PERSON, MONEY, DATE, PERCENT or TIME). When the same text occurs more
        /// than once, the label of its first occurrence is kept.
        /// </returns>
        /// <remarks>
        /// The tagger's inline-XML output for all fragments is concatenated, wrapped
        /// in a <c>root</c> element and parsed with <c>XmlDocument.LoadXml</c>, which
        /// throws <c>XmlException</c> if the combined text is not well-formed XML.
        /// NOTE(review): input containing characters such as '&amp;' or '&lt;' would
        /// presumably trigger that — confirm whether the tagger escapes them.
        /// </remarks>
        public static Dictionary <string, string> ner(List <string> content)
        {
            Dictionary <string, string> EntityDict = new Dictionary <string, string>();
            // Path to the folder with classifiers models
            var jarRoot = @"C:\Users\Gideon\Documents\stanford-ner-2017-06-09";
            var classifiersDirecrory = jarRoot + @"\classifiers";
            // Loading 7 class classifier model
            var classifier = CRFClassifier.getClassifierNoExceptions(
                classifiersDirecrory + @"\english.muc.7class.distsim.crf.ser.gz");

            // Tag every fragment and accumulate the inline XML inside one root
            // element. A builder avoids re-copying the whole text on each append.
            var taggedXml = new System.Text.StringBuilder("<root>");

            foreach (var item in content)
            {
                taggedXml.Append(classifier.classifyWithInlineXML(item));
            }

            taggedXml.Append("</root>");

            // Converting to Xml document
            System.Xml.XmlDocument xmlDoc = new System.Xml.XmlDocument();
            xmlDoc.LoadXml(taggedXml.ToString());

            // Direct children of the root are either plain text or one element per
            // tagged entity, whose element name is the entity label.
            foreach (XmlNode item in xmlDoc.DocumentElement.ChildNodes)
            {
                switch (item.Name)
                {
                    case "ORGANIZATION":
                    case "LOCATION":
                    case "PERSON":
                    case "MONEY":
                    case "DATE":
                    case "PERCENT":
                    case "TIME":
                        // First occurrence wins; later duplicates are ignored.
                        if (!EntityDict.ContainsKey(item.InnerText))
                        {
                            EntityDict.Add(item.InnerText, item.Name);
                        }
                        break;
                }
            }

            return(EntityDict);
        }