/// <summary>
/// Processes one change obtained via the web hook notification: loads the changed file,
/// extracts its text, derives key phrases from that text and hands them to
/// <c>SetTaxFieldValueAgainstKeyPhrase</c> once per configured taxonomy term name.
/// </summary>
/// <param name="cc">Client context used to load the file and execute the query.</param>
/// <param name="changeList">List that contains the changed item.</param>
/// <param name="change">Change entry; it is cast to <c>ChangeItem</c> to read the item id.</param>
private static void DoWork(ClientContext cc, List changeList, Change change)
{
    // Get the field names; if none are configured there is nothing to do.
    // The null-conditional call leaves taxonomyTerms null when the setting is absent,
    // so null has to be checked explicitly before looking at the contents.
    var taxonomyTerms = CloudConfigurationManager.GetSetting("TaxonomyTermNames")
        ?.Split(",".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
    if (taxonomyTerms == null || taxonomyTerms.Length == 0)
    {
        return;
    }

    // Get the list item from the change list.
    // Note that this is the ID of the item in the list, not a reference to its position.
    var targetListItem = changeList.GetItemById(((ChangeItem)change).ItemId);
    cc.Load(targetListItem.File);

    // Request the file's binary stream; the result is populated by ExecuteQueryRetry.
    var streamResult = targetListItem.File.OpenBinaryStream();
    cc.ExecuteQueryRetry();

    string textFromStream;
    try
    {
        // Get the text rendition of the document binary. The stream is disposed as soon
        // as the text has been extracted, whether or not parsing succeeds.
        using (var documentStream = streamResult.Value)
        {
            var tika = new Tika();
            textFromStream = tika.ParseToString(documentStream);
        }
    }
    catch (Exception ex)
    {
        // Text extraction is best-effort: log and skip this change.
        Trace.TraceWarning($"Tika Error: {ex}");
        return;
    }

    // Get key phrases from the text rendition.
    // This method is synchronous, so it blocks here until the analysis task completes.
    var client = new TextAnalyticsClient();
    var result = client.GetStringPhrasesEntities(textFromStream).Result;

    // Materialize the key phrases once so they can be logged and reused for every term.
    var keyPhrases = result as string[] ?? result.ToArray();
    Trace.TraceInformation($"Key Phrases: {string.Join(",", keyPhrases)}");

    try
    {
        // Apply the key phrases to each configured term in order; the helper's return
        // value is not needed here. The first exception stops the remaining terms.
        foreach (var term in taxonomyTerms)
        {
            SetTaxFieldValueAgainstKeyPhrase(cc, changeList, targetListItem, term, keyPhrases);
        }
    }
    catch (Exception ex)
    {
        Trace.TraceError($"Error: {ex}");
    }
}
/// <summary>
/// Runs 1000 rounds over three sample Office documents in parallel, using the <c>tika</c>
/// member declared outside this method, and checks that both <c>Parse</c> and
/// <c>ParseToString</c> return text containing the phrase expected for each document.
/// </summary>
public void ParallelTestWithCentralParser()
{
    const int rounds = 1000;

    // Key = document path, Value = phrase expected in the extracted text.
    var samples = new List<KeyValuePair<string, string>>
    {
        new KeyValuePair<string, string>(_filePathParent + "files/Tika.pptx", "Tika Test Presentation"),
        new KeyValuePair<string, string>(_filePathParent + "files/Tika.docx", "formatted in interesting ways"),
        new KeyValuePair<string, string>(_filePathParent + "files/Tika.xlsx", "Use the force duke"),
    };

    // Repeat the sample set so the parallel loop has plenty of work items.
    var workItems = new List<KeyValuePair<string, string>>();
    for (var round = 0; round < rounds; round++)
    {
        foreach (var sample in samples)
        {
            workItems.Add(sample);
        }
    }

    Parallel.ForEach(workItems, workItem =>
    {
        var expectedPhrase = workItem.Value;

        var parsed = tika.Parse(workItem.Key);
        parsed.Text.Should().Contain(expectedPhrase);

        var parsedText = tika.ParseToString(workItem.Key);
        parsedText.Should().Contain(expectedPhrase);
    });
}
/// <summary>
/// Parses the sample RTF file to a string and checks that the extracted text
/// contains a known phrase.
/// </summary>
public void Simple_File_To_String_Parsing()
{
    var rtfPath = _filePathParent + "files/Tika.rtf";

    string extracted = tika.ParseToString(rtfPath);

    extracted.Should().Contain("pack of pickled almonds");
}