/// <summary>
/// Recovers the user-name component embedded in the metadata of the document at
/// the instance field <c>path</c>: extracts the text, finds the first long XML-style
/// tag, decrypts its payload, and pulls the user portion out of a
/// MACHINE_USER_NUMBER-shaped suffix.
/// NOTE(review): <paramref name="input"/> is never used — the method re-extracts
/// from <c>path</c> instead; confirm this is intended (callers pass the decrypted value).
/// </summary>
private string GetRealUser(string input)
{
    var extractionResult = new TikaOnDotNet.TextExtraction.TextExtractor().Extract(path);
    Regex tagPattern = new Regex("(<.*?>)", RegexOptions.IgnoreCase);
    Match tagMatch = Regex.Match(extractionResult.Text, "(<.*?>)");

    string machineUserPattern = @"([A-Za-z]+)(_[A-Za-z]+)(_[0-9]+)$";
    string decrypted = String.Empty;

    // First tag longer than 22 chars carries the encrypted payload after '='.
    foreach (var candidate in this.GroupRegex(tagPattern, tagMatch))
    {
        if (candidate.Length <= 22)
        {
            continue;
        }
        string payload = candidate.Substring(candidate.IndexOf("=") + 2);
        decrypted = this.Decrypt(Regex.Replace(payload, @">$", string.Empty).TrimEnd('"'), true).Substring(8);
        break;
    }

    Regex machineUserRegex = new Regex(machineUserPattern, RegexOptions.IgnoreCase);
    Match machineUserMatch = Regex.Match(decrypted, machineUserPattern);

    // Group [1] is the "_user" capture; strip the underscore separator.
    return this.GroupRegex(machineUserRegex, machineUserMatch)[1].Replace("_", String.Empty);
}
/// <summary>Initializes the wrapper with a new Tika-backed text extractor.</summary>
public TextExtractor() => _extractor = new TikaOnDotNet.TextExtraction.TextExtractor();
/// <summary>
/// Extracts text from every file in the directory named in <c>txtBox_pathToFiles</c>,
/// sanitizes it, packs sentences into chunks that fit the Azure Text Analytics
/// 5120-character document limit, submits them to the key-phrase endpoint, and
/// reports each returned key phrase through <c>AddMessage</c>.
/// Failures are reported per file and do not stop the batch.
/// </summary>
/// <param name="sender">Standard WPF event source (unused).</param>
/// <param name="e">Standard WPF event args (unused).</param>
private void button_OK_Click(object sender, RoutedEventArgs e)
{
    var data = new MultiLanguageBatchInput();
    var di = new DirectoryInfo(txtBox_pathToFiles.Text);
    var sentenceDelimiter = new Regex(@"(\.|\!|\?)");

    foreach (FileInfo fi in di.GetFiles())
    {
        string path = fi.FullName;
        string title = fi.Name;

        // Pull raw text out of the document, then collapse control whitespace,
        // strip everything except letters and basic punctuation, and squeeze spaces.
        var extractor = new TikaOnDotNet.TextExtraction.TextExtractor();
        string text = extractor.Extract(path).Text;
        text = Regex.Replace(text, @"[\r\n\t\f\v]", " ");
        text = Regex.Replace(text, @"[^a-z.,!?]", " ", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"( +)", " ");

        List<string> sentences = sentenceDelimiter.Split(text).ToList();

        // Greedily pack sentences into chunks no larger than the service's
        // 5120-character per-document limit; fragments under 5 chars are noise
        // (mostly the captured delimiters) and are skipped.
        List<string> finalizedSentences = new List<string>();
        string chunk = "";
        foreach (var sentence in sentences)
        {
            if (sentence.Length < 5)
            {
                continue;
            }
            if (chunk.Length + sentence.Length > 5120)
            {
                finalizedSentences.Add(chunk);
                chunk = sentence;
            }
            else
            {
                chunk += " " + sentence;
            }
        }
        // BUG FIX: the original never flushed the final chunk, silently dropping
        // the tail of every document (and ALL text for documents under 5120 chars).
        if (chunk.Length > 0)
        {
            finalizedSentences.Add(chunk);
        }

        // Wrap each usable chunk as an English document with a sequential id.
        var analyzable = new List<MultiLanguageInput>();
        int i = 0;
        foreach (var s in finalizedSentences)
        {
            if (s.Length > 10)
            {
                analyzable.Add(new MultiLanguageInput("en", i.ToString(), s));
            }
            i++;
        }
        data.Documents = analyzable;

        ITextAnalyticsAPI client = new TextAnalyticsAPI();
        client.AzureRegion = AzureRegions.Westus;
        client.SubscriptionKey = key1;

        try
        {
            var result = client.KeyPhrases(data);
            foreach (var row in result.Documents)
            {
                foreach (var kp in row.KeyPhrases)
                {
                    AddMessage(kp);
                }
            }
        }
        catch (Exception ex)
        {
            // Best-effort per file: surface the failure and continue the batch.
            AddMessage(title + ": " + ex.Message);
        }
    }
}
/// <summary>
/// Verifies that the current machine and user match the credentials embedded in the
/// document at <c>path</c>, then — if they do — collects metadata tag names and maps
/// each distinct tag to the first non-empty '$'-separated assignment keyword found
/// in the matching Metadata element.
/// NOTE(review): semantics of <c>Decrypt</c>/<c>GroupRegex</c>/<c>GetRealMachine</c>
/// are defined elsewhere; the data flow here mirrors the original exactly.
/// </summary>
/// <param name="sender">Standard WinForms event source (unused).</param>
/// <param name="e">Standard WinForms event args (unused).</param>
private void button1_Click_1(object sender, EventArgs e)
{
    var extractionResult = new TikaOnDotNet.TextExtraction.TextExtractor().Extract(path);
    Regex rx = new Regex("(<.*?>)", RegexOptions.IgnoreCase);
    Match mTxt = Regex.Match(extractionResult.Text, "(<.*?>)");

    // Best-effort: clear read-only/hidden flags so the file can be opened below.
    try
    {
        System.IO.File.SetAttributes(pathTxt, FileAttributes.Normal);
    }
    catch (Exception) { }

    string subgroup = String.Empty;
    string _subgroup = String.Empty;
    var regex = @"([A-Za-z]+)(_[A-Za-z]+)(_[0-9]+)$";

    // First tag longer than 22 chars carries the encrypted payload after '='.
    foreach (var group in this.GroupRegex(rx, mTxt))
    {
        if (group.Length > 22)
        {
            subgroup = group.Substring(group.IndexOf("=") + 2);
            _subgroup = this.Decrypt(Regex.Replace(subgroup, @">$", string.Empty).TrimEnd('"'), true).Substring(8);
            break;
        }
    }

    Regex rxMachineUser = new Regex(regex, RegexOptions.IgnoreCase);
    Match m = Regex.Match(_subgroup, regex);

    string line;
    // FIX: the original created two StreamReaders and disposed neither (the first
    // was never even read from); a single using-managed reader replaces both.
    using (var reader = new StreamReader(System.IO.File.OpenRead(pathTxt)))
    {
        // FIX: ReadLine() returns null on an empty file; substitute "" instead of
        // letting Regex.Replace throw ArgumentNullException.
        line = Regex.Replace(reader.ReadLine() ?? String.Empty, @"\$(.+)", String.Empty);
    }

    string x = this.Decrypt(Regex.Replace(subgroup, @">$", string.Empty).TrimEnd('"'), true).Substring(8);

    // Select Metadata elements whose child attribute (named by `line`) decrypts to a
    // value containing both the real machine and the real user.
    // (Redundant (bool) casts on Contains() removed — Contains already returns bool.)
    XDocument xDoc = XDocument.Load(path);
    var query = xDoc.Descendants("Metadata")
        .Where(parent => parent.Elements("metadata")
            .Any(child => this.Decrypt(child.Attribute(line).Value, true).Contains(this.GetRealMachine(x))
                       && this.Decrypt(child.Attribute(line).Value, true).Contains(this.GetRealUser(x))));

    string[] assignmentKeyWords = null;
    foreach (var q in query)
    {
        // FIX: the original also re-ran the tag/decrypt loop here on every iteration
        // with identical inputs and identical results; that redundant pass is removed.
        // (Unused locals r/lenMachine/lenUser/xmlNoSpaces from the original removed too.)
        assignmentKeyWords = q.Value.Split('$');
    }

    bool realMachineUser =
        Environment.MachineName.Contains(this.GroupRegex(rxMachineUser, m)[0])
        && Environment.UserName.Equals(this.GroupRegex(rxMachineUser, m)[1].Replace("_", String.Empty));

    // Collect short, non-path, non-root tag names as candidate metadata keys.
    List<string> metadataKey = new List<string>();
    if (realMachineUser)
    {
        foreach (var group in this.GroupRegex(rx, mTxt))
        {
            if (group.Length <= 32 && !group.Contains("/") && !group.Equals("<Metadata>"))
            {
                metadataKey.Add(Regex.Replace(group, "<|>", String.Empty));
            }
        }
    }

    Dictionary<string, string> assignments = new Dictionary<string, string>();
    // FIX: guard against assignmentKeyWords being null (no Metadata element matched),
    // which previously threw NullReferenceException in the nested loop below.
    if (assignmentKeyWords != null)
    {
        foreach (var key in metadataKey.Distinct())
        {
            foreach (var assignment in assignmentKeyWords)
            {
                // First non-empty keyword wins for each key.
                if (assignment != String.Empty && !assignments.ContainsKey(key))
                {
                    assignments.Add(key, assignment);
                }
            }
        }
    }
}