Exemple #1
0
        /// <summary>
        /// Extracts the text content of the document at <paramref name="path"/>.
        /// </summary>
        /// <param name="path">Path passed to the extractor's <c>Extract</c> call.</param>
        /// <returns>The <c>Text</c> property of the extraction result.</returns>
        public static string getDocumentContent(string path)
        {
            var extractor = new TikaOnDotNet.TextExtraction.TextExtractor();

            // Run the extraction a single time and reuse its result; parsing the
            // same document again just to read Text would double the work.
            var result = extractor.Extract(path);

            return(result.Text);
        }
Exemple #2
0
 /// <summary>
 /// Returns the text extracted from the document at <paramref name="filePath"/>,
 /// or <c>null</c> when any exception is raised while extracting.
 /// </summary>
 /// <param name="filePath">Path passed to the extractor's <c>Extract</c> call.</param>
 /// <returns>The extracted text, or <c>null</c> on failure.</returns>
 public string ExtractFromDocument(string filePath)
 {
     string content;

     try
     {
         content = new TikaOnDotNet.TextExtraction.TextExtractor()
                   .Extract(filePath)
                   .Text;
     }
     catch (Exception)
     {
         // Best-effort contract: every failure is reported to the caller as null.
         content = null;
     }

     return(content);
 }
Exemple #3
0
        /// <summary>
        /// Extracts a document's metadata, stores a matching <c>DocumentModel</c> record
        /// through <c>DocumentDbContext</c>, and hands the record to
        /// <c>Services.LuceneSearch.AddUpdateLuceneIndex</c>.
        /// </summary>
        /// <param name="path">Path of the document to extract; its file name is stored on the record.</param>
        public static void AddDocument(string path)
        {
            var extractor = new TikaOnDotNet.TextExtraction.TextExtractor();
            var result    = extractor.Extract(path);

            // Pull the values out of the metadata. Keys are matched with an ordinal,
            // case-insensitive comparison so the lookup does not depend on the current
            // culture (ToLower() mis-compares e.g. "TITLE" under tr-TR).
            // A null Metadata or a key that is not present leaves the value null.
            string description = result.Metadata?.FirstOrDefault(x => string.Equals(x.Key, "description", System.StringComparison.OrdinalIgnoreCase)).Value;
            string author      = result.Metadata?.FirstOrDefault(x => string.Equals(x.Key, "author", System.StringComparison.OrdinalIgnoreCase)).Value;
            string title       = result.Metadata?.FirstOrDefault(x => string.Equals(x.Key, "title", System.StringComparison.OrdinalIgnoreCase)).Value;

            // Missing metadata is stored as an empty string rather than null.
            var document = new Models.DocumentModel()
            {
                FileName    = Path.GetFileName(path),
                Author      = author ?? "",
                Description = description ?? "",
                Title       = title ?? ""
            };

            using (var db = new DocumentDbContext())
            {
                db.Documents.Add(document);
                db.SaveChanges();
            }

            // Index only after the record has been saved.
            Services.LuceneSearch.AddUpdateLuceneIndex(document);
        }
Exemple #4
0
        /// <summary>
        /// HTTP-triggered function that reads a document id from the request body, loads the
        /// matching list item's file from SharePoint Online, extracts its text, runs key-phrase
        /// analysis on it, and writes the title and keyword metadata back to the item.
        /// </summary>
        /// <param name="req">Incoming request; its body is expected to expose an <c>ID</c> member.</param>
        /// <param name="log">Trace writer used for progress and error logging.</param>
        /// <returns>
        /// 400 when the body or the id is missing, or when an exception escapes the SharePoint
        /// session and no key phrases were collected; 404 when the list is not found;
        /// 200 with "File was not pdf or doc" for other file names; 500 with the exception
        /// message when extraction or analysis throws; otherwise 200 with the list id.
        /// </returns>
        public static async Task <HttpResponseMessage> Run([HttpTrigger(AuthorizationLevel.Anonymous, "get", "post", Route = null)] HttpRequestMessage req, TraceWriter log)
        {
            List <string> keyPhrases = new List <string>();

            string description = "";

            // Get request body. An empty body deserializes to null, and reading a member
            // off a null dynamic would throw before any error handling is in place.
            object payload = await req.Content.ReadAsAsync <object>();

            string id = null;
            if (payload != null)
            {
                dynamic data = payload;
                id = data.ID;
            }

            if (string.IsNullOrEmpty(id))
            {
                log.Info("Document id: " + id);
                return(req.CreateResponse(HttpStatusCode.BadRequest, "Couldn't analyze file. Please verify the POST payload!"));
            }

            log.Info("Document id: " + id);

            OfficeDevPnP.Core.AuthenticationManager authManager = new OfficeDevPnP.Core.AuthenticationManager();
            try
            {
                // Connects to SharePoint online site
                using (var ctx = authManager.GetSharePointOnlineAuthenticatedContextTenant(siteUrl, userName, password))
                {
                    // Retrieves list object using title
                    List list = ctx.Site.RootWeb.GetListByTitle(listName);
                    if (list == null)
                    {
                        log.Info("List is not available on the site");
                        return(req.CreateResponse(HttpStatusCode.NotFound, "List is not available on the site"));
                    }

                    ListItem li = list.GetItemById(id);

                    // One round trip loads both the item and its file properties.
                    ctx.Load(li);
                    ctx.Load(li.File);
                    ctx.ExecuteQuery();

                    // We CAN extract text out of most documents with the library, but for this demo
                    // the options are limited to these 2 that are known to be working.
                    // The match is ordinal and case-insensitive so "REPORT.PDF" is accepted too.
                    string fileName    = li.File.Name;
                    bool   isSupported = fileName.IndexOf(".pdf", StringComparison.OrdinalIgnoreCase) >= 0 ||
                                         fileName.IndexOf(".doc", StringComparison.OrdinalIgnoreCase) >= 0;
                    if (!isSupported)
                    {
                        return(req.CreateResponse(HttpStatusCode.OK, "File was not pdf or doc"));
                    }

                    log.Info("It was a valid file! Continuing into handling...");

                    try
                    {
                        log.Info("Got a file! Name: " + fileName);

                        var fileRef = li.File.ServerRelativeUrl;

                        // Download the file once and dispose the response when done.
                        using (var fileInfo = Microsoft.SharePoint.Client.File.OpenBinaryDirect(ctx, fileRef))
                        using (var ms = new MemoryStream())
                        {
                            log.Info("Extracting text..");

                            fileInfo.Stream.CopyTo(ms);
                            byte[] fileContents = ms.ToArray();

                            var    extractor        = new TikaOnDotNet.TextExtraction.TextExtractor();
                            var    extractionResult = extractor.Extract(fileContents);
                            string text             = extractionResult.Text;

                            List <MultiLanguageInput> analyzable = FormatAnalyzableText(ref text);

                            log.Info("Formed altogether " + analyzable.Count + " sentences to analyze!");

                            // Log at most the first 500 characters of the extracted text.
                            int snippetEnd = Math.Min(500, text.Length);
                            log.Info("Extracted text! First few rows here.. \r\n " + text.Substring(0, snippetEnd));

                            RunTextAnalysis(ref keyPhrases, analyzable, log);
                        }

                        log.Info("Found " + keyPhrases.Count + " key phrases! First 20 are here: ");
                        foreach (var kp in keyPhrases.Take(20))
                        {
                            log.Info(kp);
                        }

                        // Failures while writing back are logged but do not fail the request.
                        try
                        {
                            log.Info("Saving to SharePoint..");
                            TextInfo ti = new CultureInfo("en-US", false).TextInfo;

                            li["Title"] = ti.ToTitleCase(li.File.Name);

                            // NOTE(review): 'description' is assembled from the top key phrases but is
                            // never assigned to the item in this method - confirm whether it should
                            // be written to a field.
                            foreach (var s in keyPhrases.Take(keywordCount))
                            {
                                description += s + "\r\n";
                            }

                            li.Update();

                            try
                            {
                                // This ExecuteQuery also submits the pending li.Update().
                                ctx.Load(list.Fields);
                                ctx.ExecuteQuery();

                                log.Info("Updating Managed Metadata...");

                                var fieldnames = new string[] { "Keywords" };
                                var field      = list.GetFields(fieldnames).First();

                                // setting managed metadata
                                log.Info("Updating keywords to taxonomy! Taking: " + keywordCount);
                                UpdateManagedMetadata(keyPhrases.Take(keywordCount).ToArray(), log, ctx, li, field, wantedGuid);

                                ctx.ExecuteQuery();
                            }
                            catch (Exception ex)
                            {
                                log.Error(ex.Message);
                            }
                        }
                        catch (Exception ex)
                        {
                            log.Error(ex.Message);
                        }
                    }
                    catch (Exception ex)
                    {
                        log.Error(ex.Message);
                        return(req.CreateResponse(HttpStatusCode.InternalServerError, ex.Message));
                    }

                    return(req.CreateResponse(HttpStatusCode.OK, list.Id));
                }
            }
            catch (Exception ex)
            {
                // Connection, list lookup and item loading failures end up here.
                log.Error("Error Message: " + ex.Message);
            }

            log.Info("");

            // Only reached when the SharePoint session above threw.
            var returnable = JsonConvert.SerializeObject(keyPhrases);

            return(keyPhrases.Count <= 0
                ? req.CreateResponse(HttpStatusCode.BadRequest, "Couldn't analyze file. Please verify the POST payload!")
                : req.CreateResponse(HttpStatusCode.OK, returnable));
        }