Наследование: ITextExtractor
Пример #1
0
        /// <summary>
        /// Extracts the text of the file at <c>path</c>, decrypts the payload of the first
        /// tag longer than 22 characters and returns the element at index 1 of the
        /// <c>GroupRegex</c> result for the trailing "letters_letters_digits" pattern,
        /// with underscores removed.
        /// </summary>
        /// <param name="input">Not read by this method; the data comes from <c>path</c>.</param>
        /// <returns>The index-1 group with every "_" stripped.</returns>
        private string GetRealUser(string input)
        {
            const string tagPattern         = "(<.*?>)";
            const string machineUserPattern = @"([A-Za-z]+)(_[A-Za-z]+)(_[0-9]+)$";

            var   extraction = new TikaOnDotNet.TextExtraction.TextExtractor().Extract(path);
            Regex tagRegex   = new Regex(tagPattern, RegexOptions.IgnoreCase);
            Match firstTag   = Regex.Match(extraction.Text, tagPattern);

            // Stays empty when no tag is long enough to carry an encrypted payload.
            string decryptedTail = String.Empty;

            foreach (var tag in this.GroupRegex(tagRegex, firstTag))
            {
                if (group_IsTooShort(tag.Length))
                {
                    continue;
                }

                // Payload starts two characters after the first '=' (presumably skipping
                // the '=' and an opening quote — confirm against the file format).
                string payload  = tag.Substring(tag.IndexOf("=") + 2);
                string stripped = Regex.Replace(payload, @">$", string.Empty).TrimEnd('"');

                // The first 8 characters of the decrypted value are discarded.
                decryptedTail = this.Decrypt(stripped, true).Substring(8);
                break;
            }

            Regex machineUserRegex = new Regex(machineUserPattern, RegexOptions.IgnoreCase);
            Match machineUserMatch = Regex.Match(decryptedTail, machineUserPattern);

            return this.GroupRegex(machineUserRegex, machineUserMatch)[1].Replace("_", String.Empty);

            // Only tags longer than 22 characters are considered.
            bool group_IsTooShort(int length)
            {
                return length <= 22;
            }
        }
Пример #2
0
 /// <summary>
 /// Initializes a new instance and stores a freshly created
 /// <c>TikaOnDotNet.TextExtraction.TextExtractor</c> in <c>_extractor</c>.
 /// </summary>
 public TextExtractor()
 {
     this._extractor = new TikaOnDotNet.TextExtraction.TextExtractor();
 }
        /// <summary>
        /// Handles the OK button click: for every file in the directory named by
        /// <c>txtBox_pathToFiles</c>, extracts its text, packs the sentences into chunks,
        /// requests key phrases for those chunks and reports each phrase through
        /// <c>AddMessage</c>.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; not used.</param>
        /// <remarks>
        /// Failures of the key-phrase request are reported per file through
        /// <c>AddMessage</c>; failures while listing or extracting files propagate.
        /// </remarks>
        private void button_OK_Click(object sender, RoutedEventArgs e)
        {
            // Maximum chunk size; presumably the service's per-document limit — TODO confirm.
            const int maxDocumentLength = 5120;
            // Split fragments shorter than this are discarded as noise.
            const int minSentenceLength = 5;
            // Chunks must be longer than this to be submitted.
            const int minDocumentLength = 10;

            var data = new MultiLanguageBatchInput();
            var di   = new DirectoryInfo(txtBox_pathToFiles.Text);

            var sentenceDelimiter = new Regex(@"(\.|\!|\?)");

            // Both objects are loop-invariant, so they are created once and reused for every file.
            var extractor = new TikaOnDotNet.TextExtraction.TextExtractor();

            ITextAnalyticsAPI client = new TextAnalyticsAPI();
            client.AzureRegion     = AzureRegions.Westus;
            client.SubscriptionKey = key1;

            foreach (FileInfo fi in di.GetFiles())
            {
                string path  = fi.FullName;
                string title = fi.Name;

                string text = extractor.Extract(path).Text;

                // Normalize: control whitespace -> space, drop everything except latin
                // letters and basic punctuation, then collapse runs of spaces.
                text = Regex.Replace(text, @"[\r\n\t\f\v]", " ");
                text = Regex.Replace(text, @"[^a-z.,!?]", " ", RegexOptions.IgnoreCase);
                text = Regex.Replace(text, @"( +)", " ");

                // The delimiter pattern has a capture group, so Split also returns the
                // delimiters themselves; they are removed by the minimum-length filter.
                var    chunks = new List<string>();
                string chunk  = "";
                foreach (string sentence in sentenceDelimiter.Split(text))
                {
                    if (sentence.Length < minSentenceLength)
                    {
                        continue;
                    }

                    // +1 accounts for the joining space so a chunk never exceeds the limit
                    // (a single sentence longer than the limit is still passed on unchanged).
                    if (chunk.Length + 1 + sentence.Length > maxDocumentLength)
                    {
                        chunks.Add(chunk);
                        chunk = sentence;
                    }
                    else
                    {
                        chunk += " " + sentence;
                    }
                }

                // Flush the trailing chunk; without this the last chunk of every file
                // (and the whole content of short files) would never be analyzed.
                if (chunk.Length > 0)
                {
                    chunks.Add(chunk);
                }

                var analyzable = new List<MultiLanguageInput>();
                for (int i = 0; i < chunks.Count; i++)
                {
                    // The document id is the chunk's position, including skipped chunks.
                    if (chunks[i].Length > minDocumentLength)
                    {
                        analyzable.Add(new MultiLanguageInput("en", i.ToString(), chunks[i]));
                    }
                }
                data.Documents = analyzable;

                try
                {
                    // NOTE(review): this call runs synchronously inside the click handler.
                    var result = client.KeyPhrases(data);

                    foreach (var row in result.Documents)
                    {
                        foreach (var kp in row.KeyPhrases)
                        {
                            AddMessage(kp);
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Report the failure for this file and continue with the next one.
                    AddMessage(title + ": " + ex.Message);
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Handles the button click: reads the tags of the file at <c>path</c>, decrypts the
        /// machine/user payload, looks up the matching <c>Metadata</c> element in the XML and,
        /// when the current machine and user match, pairs each metadata key with an
        /// assignment keyword.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; not used.</param>
        /// <remarks>
        /// NOTE(review): the resulting dictionary is not stored or returned anywhere in this
        /// method, so the handler currently has no observable result besides resetting the
        /// attributes of <c>pathTxt</c>.
        /// </remarks>
        private void button1_Click_1(object sender, EventArgs e)
        {
            const string tagPattern         = "(<.*?>)";
            const string machineUserPattern = @"([A-Za-z]+)(_[A-Za-z]+)(_[0-9]+)$";

            var extraction = new TikaOnDotNet.TextExtraction.TextExtractor().Extract(path);

            Regex rx   = new Regex(tagPattern, RegexOptions.IgnoreCase);
            Match mTxt = Regex.Match(extraction.Text, tagPattern);

            // Evaluated once and reused by both tag loops below.
            var tags = this.GroupRegex(rx, mTxt);

            try
            {
                System.IO.File.SetAttributes(pathTxt, FileAttributes.Normal);
            }
            catch (Exception ex)
            {
                // Best effort: a failure here must not abort the handler, but leave a trace.
                System.Diagnostics.Debug.WriteLine("SetAttributes failed for " + pathTxt + ": " + ex.Message);
            }

            string subgroup  = String.Empty;
            string _subgroup = String.Empty;

            // The first tag longer than 22 characters carries the encrypted payload.
            foreach (var tag in tags)
            {
                if (tag.Length > 22)
                {
                    subgroup  = tag.Substring(tag.IndexOf("=") + 2);
                    _subgroup = this.Decrypt(Regex.Replace(subgroup, @">$", string.Empty).TrimEnd('"'), true).Substring(8);
                    break;
                }
            }

            Regex rxMachineUser = new Regex(machineUserPattern, RegexOptions.IgnoreCase);
            Match m             = Regex.Match(_subgroup, machineUserPattern);

            // Read only the first line; the reader (and its file handle) is released right away.
            string line;
            using (var reader = new StreamReader(pathTxt))
            {
                line = reader.ReadLine();
            }

            if (line != null)
            {
                // Keep only the part in front of the first '$'.
                line = Regex.Replace(line, @"\$(.+)", String.Empty);
            }

            if (String.IsNullOrEmpty(line))
            {
                // Without an attribute name nothing can be looked up in the XML.
                System.Diagnostics.Debug.WriteLine("No attribute name found in the first line of " + pathTxt);
                return;
            }

            string x = this.Decrypt(Regex.Replace(subgroup, @">$", string.Empty).TrimEnd('"'), true).Substring(8);

            // Resolved once instead of once per XML child element.
            string realMachine = this.GetRealMachine(x);
            string realUser    = this.GetRealUser(x);

            XDocument xDoc = XDocument.Load(path);

            var query = xDoc.Descendants("Metadata")
                        .Where(parent => parent.Elements("metadata")
                               .Any(child =>
                               {
                                   var keyAttribute = child.Attribute(line);
                                   if (keyAttribute == null)
                                   {
                                       // Elements without the attribute simply do not match.
                                       return false;
                                   }

                                   var decryptedValue = this.Decrypt(keyAttribute.Value, true);
                                   return decryptedValue.Contains(realMachine) &&
                                          decryptedValue.Contains(realUser);
                               }));

            // Stays empty when no element matches, so the pairing loop below is a no-op
            // instead of a NullReferenceException. The last matching element wins.
            string[] assignmentKeyWords = new string[0];
            foreach (var q in query)
            {
                assignmentKeyWords = q.Value.Split('$');
            }

            var  machineUserGroups = this.GroupRegex(rxMachineUser, m);
            bool realMachineUser   = Environment.MachineName.Contains(machineUserGroups[0]) &&
                                     Environment.UserName.Equals(machineUserGroups[1].Replace("_", String.Empty));

            List<string> metadataKey = new List<string>();

            if (realMachineUser)
            {
                foreach (var tag in tags)
                {
                    // Short opening tags other than the <Metadata> container are the keys.
                    if (tag.Length <= 32 && !tag.Contains("/") && !tag.Equals("<Metadata>"))
                    {
                        metadataKey.Add(Regex.Replace(tag, "<|>", String.Empty));
                    }
                }
            }

            Dictionary<string, string> assignments = new Dictionary<string, string>();

            // NOTE(review): every key receives the FIRST non-empty keyword, exactly as the
            // previous nested loop did; confirm whether a key/keyword pairing was intended.
            foreach (var key in metadataKey.Distinct())
            {
                foreach (var assignment in assignmentKeyWords)
                {
                    if (assignment != String.Empty)
                    {
                        assignments.Add(key, assignment);
                        break;
                    }
                }
            }
        }