/// <summary>
/// Builds a <see cref="Document"/> from a raw, untagged DocFile by running it through
/// the conversion, tagging and parsing stages in sequence.
/// </summary>
/// <param name="doc">The raw, untagged DocFile to parse.</param>
/// <returns>
/// The <see cref="Document"/> loaded from the tagged form of the DocFile's text.
/// </returns>
public Document DocumentFromDoc(DocFile doc)
{
    // Stage 1: DocFile -> DocXFile. The 'as' cast yields null (rather than throwing)
    // if the converter hands back some other file type; that null is passed on as-is.
    var wordXmlFile = new DocToDocXConverter(doc).ConvertFile() as DocXFile;

    // Stage 2: DocXFile -> text file.
    var textFile = new DocxToTextConverter(wordXmlFile).ConvertFile();

    // Stage 3: tag the text file using this instance's TaggerMode.
    var tagger = new SharpNLPTagger(this.TaggerMode, textFile.FullPath);
    var taggedFile = new TaggedFile(tagger.ProcessFile());

    // Stage 4: parse the tagged output into a Document named after the text file.
    var parser = new TaggedSourceParser(taggedFile);
    return parser.LoadDocument(textFile.NameSansExt);
}
/// <summary>
/// Asynchronously loads all of the text in the DocXFile.
/// </summary>
/// <returns>
/// A <see cref="Task{TResult}"/> which, when awaited, yields all of the text in the DocXFile.
/// </returns>
public override async Task<string> LoadTextAsync()
{
    // Convert this file to a text file first, then read that file's text.
    // Neither await captures the calling context.
    var textFile = await new DocxToTextConverter(this).ConvertFileAsync().ConfigureAwait(false);
    var contents = await textFile.LoadTextAsync().ConfigureAwait(false);
    return contents;
}
/// <summary>
/// Verifies that converting <c>Input</c> synchronously produces a text file
/// whose path exists on disk.
/// </summary>
public void ConvertFileTest()
{
    var converter = new DocxToTextConverter(Input);

    TxtFile converted = converter.ConvertFile();
    var convertedPath = converted.FullPath;

    // The conversion is considered successful when the reported path exists.
    Check.That(convertedPath).Satisfies(File.Exists);
}
/// <summary>
/// Verifies that converting <c>Input</c> asynchronously produces a text file
/// that exists on disk.
/// </summary>
/// <returns>A task that completes when the conversion has been awaited and checked.</returns>
public async Task ConvertFileAsyncTest()
{
    var converter = new DocxToTextConverter(Input);

    TxtFile converted = await converter.ConvertFileAsync();

    // NOTE(review): FileInfo(...) is invoked without 'new', so it presumably resolves
    // to a helper method declared outside this block - confirm before changing it.
    Check.That(FileInfo(converted.FullPath)).Satisfies(info => info.Exists);
}
/// <summary>
/// Resolves the file referenced by the event's Umbraco property, extracts its text with
/// the converter matching the file extension, and stores the space-separated words that
/// are not in <c>IGNORE</c> in the event's <c>Value</c>.
/// </summary>
/// <param name="e">
/// The index-field event whose <c>UmbracoProperty.Value</c> supplies the file name and
/// whose <c>Value</c> receives the extracted text.
/// </param>
public void Execute(IndexFieldEvent e)
{
    var rawValue = e.UmbracoProperty.Value;
    var umbracoFileName = rawValue != null ? rawValue.ToString() : string.Empty;

    // Read the setting once and reuse it for both replacements.
    // NOTE(review): if the configured path itself contains "/media", the second Replace
    // also rewrites the text inserted by the first one - confirm this is intended.
    var docPath = ConfigurationManager.AppSettings["Dexter:DocumentPath"];
    var filePath = string.IsNullOrWhiteSpace(docPath)
        ? umbracoFileName
        : umbracoFileName.Replace("~/media", docPath)
                         .Replace("/media", docPath);

    // Normalize the extension so that ".PDF", ".Docx", etc. pick the same converter as
    // their lower-case forms; GetExtension returns null for a null path, hence the '??'.
    var extension = (System.IO.Path.GetExtension(filePath) ?? string.Empty).ToLowerInvariant();

    var text = string.Empty;
    switch (extension)
    {
        case ".pdf":
            text = new PdfToTextConverter().Convert(filePath);
            break;
        case ".doc":
            text = new DocToTextConverter().Convert(filePath);
            break;
        case ".xls":
            text = new XlsToTextConverter().Convert(filePath);
            break;
        case ".docx":
            text = new DocxToTextConverter().Convert(filePath);
            break;
        case ".xlsx":
            text = new XlsxToTextConverter().Convert(filePath);
            break;
        case ".pptx":
            text = new PptxToTextConverter().Convert(filePath);
            break;
        case ".ppt":
            // No converter is used for .ppt; only the file name is indexed.
            text = System.IO.Path.GetFileName(filePath);
            break;
        case ".zip":
            text = new ZipToTextConverter().Convert(filePath);
            break;
    }

    // Guard against a converter handing back null so that indexing yields an empty
    // value instead of throwing. Except() is a set difference, so repeated words are
    // also collapsed to a single occurrence.
    var words = (text ?? string.Empty).Split(new[] { ' ' });
    e.Value = string.Join(" ", words.Except(IGNORE));
}
/// <summary>
/// Loads all of the text in the DocXFile as a single string.
/// </summary>
/// <returns>A string containing all of the text in the DocXFile.</returns>
public override string LoadText()
{
    // Convert this file to a text file, then read the text back from that file.
    var textFile = new DocxToTextConverter(this).ConvertFile();
    return textFile.LoadText();
}
/// <summary>
/// Verifies that a converter constructed from <c>Input</c> exposes, through
/// <c>Original</c>, a file with the same full path as <c>Input</c>.
/// </summary>
public void DocxToTextConverterConstructorTest()
{
    var converter = new DocxToTextConverter(Input);

    var actualPath = converter.Original.FullPath;

    Check.That(actualPath).IsEqualTo(Input.FullPath);
}