/// <summary>
/// Parses three sample documents (pptx, docx, xlsx) 1000 times each in parallel and checks
/// that every result contains the phrase expected for that document. Each iteration uses a
/// freshly created Tika instance for <c>Parse</c> and the <c>tika</c> instance declared
/// outside this method for <c>ParseToString</c>.
/// </summary>
public void ParallelTestWithLokalParser()
{
    // Key = path of the sample file, Value = text that must appear in the extracted content.
    var samples = new List<KeyValuePair<string, string>>
    {
        new KeyValuePair<string, string>(_filePathParent + "files/Tika.pptx", "Tika Test Presentation"),
        new KeyValuePair<string, string>(_filePathParent + "files/Tika.docx", "formatted in interesting ways"),
        new KeyValuePair<string, string>(_filePathParent + "files/Tika.xlsx", "Use the force duke")
    };

    // Repeat the sample set 1000 times to build the parallel workload.
    var workload = new List<KeyValuePair<string, string>>();
    for (int round = 0; round < 1000; round++)
    {
        workload.AddRange(samples);
    }

    Parallel.ForEach(workload, sample =>
    {
        var localTika = new Tika();

        var parsed = localTika.Parse(sample.Key);
        parsed.Text.Should().Contain(sample.Value);

        var parsedText = tika.ParseToString(sample.Key);
        parsedText.Should().Contain(sample.Value);
    });
}
/// <summary>
/// Processes one change obtained via the web hook notification: reads the changed list
/// item's file, extracts its text with Tika, obtains key phrases for that text and hands
/// them to <c>SetTaxFieldValueAgainstKeyPhrase</c> for every configured taxonomy term name.
/// </summary>
/// <param name="cc">Client context used to load the file and execute the query.</param>
/// <param name="changeList">List that contains the changed item.</param>
/// <param name="change">The change to process; it is cast to <c>ChangeItem</c> to read the item id.</param>
private static void DoWork(ClientContext cc, List changeList, Change change)
{
    // Get the field names; if there are none then don't do anything.
    // When GetSetting returns null the null-conditional Split leaves taxonomyTerms null as well,
    // so both null and empty have to be treated as "nothing configured".
    var taxonomyTerms = CloudConfigurationManager.GetSetting("TaxonomyTermNames")
        ?.Split(",".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
    if (taxonomyTerms == null || !taxonomyTerms.Any())
    {
        return;
    }

    // Get the list item from the change list.
    // Note that this is the ID of the item in the list, not a reference to its position.
    var targetListItem = changeList.GetItemById(((ChangeItem)change).ItemId);
    cc.Load(targetListItem.File);

    // Request the file's binary stream; it becomes available once the query has executed.
    var streamResult = targetListItem.File.OpenBinaryStream();
    cc.ExecuteQueryRetry();

    string textFromStream;
    try
    {
        // Get the text rendition of the document binary.
        var tika = new Tika();
        textFromStream = tika.ParseToString(streamResult.Value);
    }
    catch (Exception ex)
    {
        // Without text there is nothing to analyse, so log and give up on this change.
        Trace.TraceWarning($"Tika Error: {ex}");
        return;
    }

    // Get key phrases from the text rendition.
    // NOTE(review): .Result blocks the calling thread until the task completes.
    var client = new TextAnalyticsClient();
    var result = client.GetStringPhrasesEntities(textFromStream).Result;

    // Materialize the key phrases once so they can be logged and reused for every term.
    var keyPhrases = result as string[] ?? result.ToArray();
    Trace.TraceInformation($"Key Phrases: {string.Join(",", keyPhrases)}");

    try
    {
        // Apply the key phrases to each configured term; the first failure aborts the remaining terms.
        foreach (var term in taxonomyTerms)
        {
            SetTaxFieldValueAgainstKeyPhrase(cc, changeList, targetListItem, term, keyPhrases);
        }
    }
    catch (Exception ex)
    {
        Trace.TraceError($"Error: {ex}");
    }
}
/// <summary>
/// Extracts the text content of the file at <paramref name="filePath"/> using Tika.
/// </summary>
/// <param name="filePath">Path of the file to parse.</param>
/// <returns>The text produced by Tika, or an empty string when parsing throws.</returns>
public string ReadPdfFile(string filePath)
{
    try
    {
        Tika t = new Tika();
        return t.parseToString(new java.io.File(filePath));
    }
    catch (Exception ex)
    {
        // Best effort: callers still get "" on failure, but the cause is traced
        // instead of being discarded silently.
        System.Diagnostics.Trace.TraceWarning("Tika failed to parse '{0}': {1}", filePath, ex);
        return "";
    }
}
/// <summary>
/// Checks that Tika's stream-based detection reports "application/x-tika-ooxml"
/// for the sample file samples\test1.xps.
/// </summary>
public void TestTikaAutodetect()
{
    Tika detector = new Tika();
    File sample = new File("samples\\test1.xps");

    // Fail early with a readable message when the sample file is missing.
    if (!sample.isFile())
    {
        throw new Exception(sample.getName() + " does not exists.");
    }

    using (InputStream input = new FileInputStream(sample))
    {
        string detected = detector.detect(input, new Metadata());
        Assert.AreEqual("application/x-tika-ooxml", detected);
        input.close();
    }
}
/// <summary>
/// Prepares the fixture: creates the Tika instance under test and stores the
/// application base directory of the current AppDomain in <c>_filePathParent</c>.
/// </summary>
public virtual void SetUp()
{
    _cut = new Tika();

    var domainSetup = AppDomain.CurrentDomain.SetupInformation;
    _filePathParent = domainSetup.ApplicationBase;
}