示例#1
0
        /// <summary>
        /// Parses three sample Office documents from parallel workers and checks that
        /// each extraction contains the phrase expected for that document.
        /// The three (path, expected phrase) pairs are repeated 1000 times, so 3000
        /// work items are handed to <c>Parallel.ForEach</c>.
        /// </summary>
        public void ParallelTestWithLokalParser()
        {
            // Key = path of the sample document, Value = phrase expected in its text.
            var samples = new List<KeyValuePair<string, string>>
            {
                new KeyValuePair<string, string>(_filePathParent + "files/Tika.pptx", "Tika Test Presentation"),
                new KeyValuePair<string, string>(_filePathParent + "files/Tika.docx", "formatted in interesting ways"),
                new KeyValuePair<string, string>(_filePathParent + "files/Tika.xlsx", "Use the force duke")
            };

            var workItems = new List<KeyValuePair<string, string>>();
            var round = 0;
            while (round < 1000)
            {
                workItems.AddRange(samples);
                round++;
            }

            Parallel.ForEach(workItems, sample =>
            {
                // A parser instance created inside the worker.
                var localParser = new Tika();
                var parsed      = localParser.Parse(sample.Key);
                parsed.Text.Should().Contain(sample.Value);

                // `tika` is declared outside this method (not visible here) and is
                // therefore the same instance for every worker.
                var extractedText = tika.ParseToString(sample.Key);
                extractedText.Should().Contain(sample.Value);
            });
        }
        /// <summary>
        /// Processes one change obtained via the web hook notification: loads the changed
        /// list item's file, extracts its text with Tika, requests key phrases for that
        /// text, and passes them to <c>SetTaxFieldValueAgainstKeyPhrase</c> once per
        /// configured taxonomy term.
        /// </summary>
        /// <param name="cc">Client context used to load the file and execute the query.</param>
        /// <param name="changeList">List from which the changed item is fetched by ID.</param>
        /// <param name="change">The change to process; it is cast to <c>ChangeItem</c> to read its <c>ItemId</c>.</param>
        private static void DoWork(ClientContext cc, List changeList, Change change)
        {
            // Get Fields, if no fields then don't do anything.
            // The null-conditional call yields null when the setting is absent, so null
            // has to be treated the same as an empty list instead of being dereferenced.
            var taxonomyTerms = CloudConfigurationManager.GetSetting("TaxonomyTermNames")
                                ?.Split(",".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

            if (taxonomyTerms == null || taxonomyTerms.Length == 0)
            {
                return;
            }

            // Get the list item from the Change List
            // Note that this is the ID of the item in the list, not a reference to its position.
            var targetListItem = changeList.GetItemById(((ChangeItem)change).ItemId);

            cc.Load(targetListItem.File);

            // Get the File Binary Stream
            var streamResult = targetListItem.File.OpenBinaryStream();

            cc.ExecuteQueryRetry();

            string textFromStream;

            try
            {
                // Get Text Rendition of document binary
                var tika = new Tika();
                textFromStream = tika.ParseToString(streamResult.Value);
            }
            catch (Exception ex)
            {
                // Text extraction failed: log it and give up on this change.
                Trace.TraceWarning($"Tika Error: {ex}");
                return;
            }

            // Get Key phrases from text rendition.
            // NOTE(review): .Result blocks this thread until the call completes; kept
            // because this method is synchronous and its signature must not change.
            var client = new TextAnalyticsClient();
            var result = client.GetStringPhrasesEntities(textFromStream).Result;

            // Reuse the result when it is already an array, otherwise materialize it once.
            var keyPhrases = result as string[] ?? result.ToArray();

            Trace.TraceInformation($"Key Phrases: {string.Join(",", keyPhrases)}");

            try
            {
                // Apply the key phrases to each configured term in order. As before, the
                // first failure stops processing of the remaining terms and is logged.
                foreach (var term in taxonomyTerms)
                {
                    SetTaxFieldValueAgainstKeyPhrase(cc, changeList, targetListItem, term, keyPhrases);
                }
            }
            catch (Exception ex)
            {
                Trace.TraceError($"Error: {ex}");
            }
        }
示例#3
0
 /// <summary>
 /// Extracts the text of the file at <paramref name="filePath"/> using Tika.
 /// </summary>
 /// <param name="filePath">Path of the file to read.</param>
 /// <returns>The extracted text, or an empty string when extraction throws.</returns>
 public string ReadPdfFile(string filePath)
 {
     try
     {
         Tika parser = new Tika();
         return parser.parseToString(new java.io.File(filePath));
     }
     catch (Exception ex)
     {
         // Best-effort contract is preserved (callers still receive ""), but the
         // failure is traced so that it is no longer swallowed without any record.
         System.Diagnostics.Trace.TraceWarning("ReadPdfFile failed for '" + filePath + "': " + ex);
         return "";
     }
 }
示例#4
0
        /// <summary>
        /// Checks that Tika's stream-based detection reports the MIME type
        /// "application/x-tika-ooxml" for the sample file samples\test1.xps.
        /// </summary>
        public void TestTikaAutodetect()
        {
            Tika tika = new Tika();
            File xpsFile = new File("samples\\test1.xps");

            // A missing fixture is reported as an assertion failure rather than by
            // throwing a bare System.Exception.
            Assert.IsTrue(xpsFile.isFile(), xpsFile.getName() + " does not exist.");

            // The using block closes the stream on exit, so no explicit close() is needed.
            using (InputStream inputStream = new FileInputStream(xpsFile))
            {
                Metadata metadata = new Metadata();

                string mimeType = tika.detect(inputStream, metadata);
                Assert.AreEqual("application/x-tika-ooxml", mimeType);
            }
        }
示例#5
0
 /// <summary>
 /// Creates a fresh <c>Tika</c> instance in <c>_cut</c> and stores the current
 /// AppDomain's application base path in <c>_filePathParent</c>.
 /// </summary>
 public virtual void SetUp()
 {
     var parser = new Tika();
     _cut = parser;

     var setupInfo = AppDomain.CurrentDomain.SetupInformation;
     _filePathParent = setupInfo.ApplicationBase;
 }