/// <summary>
/// Verifies that a <c>MaxentTagger</c> can be constructed from a plain
/// (non-gzipped) model file handed to the Java side through an IKVM
/// <c>InputStreamWrapper</c>.
/// </summary>
public void MaxentTaggerTest()
{
    // Plain model in the file
    var model = Files.Parser.Models("pos-tagger/english-left3words-distsim.tagger");

    // The model is only read, so request read-only access and allow other readers.
    // The two-argument FileStream constructor asks for read/write access, which
    // fails on read-only model files.
    using var fs = new FileStream(model, FileMode.Open, FileAccess.Read, FileShare.Read);
    using var isw = new ikvm.io.InputStreamWrapper(fs);

    var tagger = new MaxentTagger(isw);
    Assert.NotNull(tagger);
}
//public edu.stanford.nlp.parser.lexparser.LexicalizedParser lp;

/// <summary>
/// Ribbon load handler. Runs <c>loadBoilerplate</c>, fills the <c>dd_Langs</c>
/// drop-down with Word's languages ordered by local name, shows the configured
/// minimum/maximum phrase lengths, and builds the POS tagger from
/// <c>Properties.Resources.posmodel</c>.
/// </summary>
/// <param name="sender">Event source; not used by this handler.</param>
/// <param name="e">Event arguments; not used by this handler.</param>
private void Ribbon1_Load(object sender, RibbonUIEventArgs e)
{
    loadBoilerplate();

    //Languages
    var languagesByName = new List<Tuple<string, Word.Language>>();
    foreach (Word.Language language in Globals.ThisAddIn.Application.Languages)
    {
        languagesByName.Add(Tuple.Create(language.NameLocal, language));
    }

    languagesByName.Sort((left, right) => left.Item1.CompareTo(right.Item1));

    foreach (var entry in languagesByName)
    {
        // One drop-down entry per language: the label is the local name, the tag is the language ID.
        RibbonDropDownItem dropDownItem = Globals.Factory.GetRibbonFactory().CreateRibbonDropDownItem();
        dropDownItem.Label = entry.Item1;
        dropDownItem.Tag = entry.Item2.ID;
        dd_Langs.Items.Add(dropDownItem);

        //if (entry.Item1 == Properties.Settings.Default.Options_ProofLanguage)
        //{
        //    dd_Langs.SelectedItem = dropDownItem;
        //}
    }

    // Read both limits before touching either edit box.
    uint minPhraseLength = Properties.Settings.Default.Options_PhraseLengthMin;
    uint maxPhraseLength = Properties.Settings.Default.Options_PhraseLengthMax;
    edit_MinPhraseLen.Text = minPhraseLength.ToString();
    edit_MaxPhraseLen.Text = maxPhraseLength.ToString();

    //POSTagger
    Debug.WriteLine("Loading tagger model...");
    var modelBytes = new MemoryStream(Properties.Resources.posmodel);
    java.io.InputStream modelInput = new ikvm.io.InputStreamWrapper(modelBytes);
    tagger = new MaxentTagger(modelInput);
    Debug.WriteLine("Model loaded.");

    //Typed Dependencies
    //Debug.WriteLine("Loading lexical parser model...");
    //_stream = new MemoryStream(Properties.Resources.englishPCFG_ser);
    //var isw = new ikvm.io.InputStreamWrapper(_stream);
    //var gzs = new java.util.zip.GZIPInputStream(isw);
    //var ois = new java.io.ObjectInputStream(gzs);
    //lp = LexicalizedParser.loadModel(ois);
    //Debug.WriteLine("Model loaded.");
}
/// <summary>
/// Verifies that a <c>LexicalizedParser</c> can be loaded from a serialized model
/// file through an <c>ObjectInputStream</c>, transparently un-gzipping the file
/// when its name ends in ".gz".
/// </summary>
public void LexicalizedParserTest()
{
    // GZIPed model in the file
    var model = Files.Parser.Models("lexparser/englishPCFG.ser.gz");

    // The model is only read, so request read-only access and allow other readers.
    // The two-argument FileStream constructor asks for read/write access, which
    // fails on read-only model files.
    using var fs = new FileStream(model, FileMode.Open, FileAccess.Read, FileShare.Read);
    using var isw = new ikvm.io.InputStreamWrapper(fs);

    // File-name suffix check: compare ordinally rather than with the current culture.
    using var ois = model.EndsWith(".gz", System.StringComparison.Ordinal)
        ? new ObjectInputStream(new GZIPInputStream(isw))
        : new ObjectInputStream(isw);

    var lp = LexicalizedParser.loadModel(ois);
    Assert.NotNull(lp);
}
/// <summary>
/// Parses a pdf item and returns its content as string.
/// </summary>
/// <param name="mediaItem">MediaItem (should be a pdf - otherwise an empty string will be returned).</param>
/// <returns>
/// String representation of the pdf content. An empty string is returned when
/// <paramref name="mediaItem"/> is null or not a pdf, when an encrypted document
/// cannot be decrypted, or when parsing throws (the error is logged).
/// </returns>
public static string ParsePdf(MediaItem mediaItem)
{
    if (mediaItem == null || mediaItem.MimeType != PdfMimetype)
    {
        return string.Empty;
    }

    PDDocument doc = null;
    ikvm.io.InputStreamWrapper wrapper = null;
    try
    {
        var stream = mediaItem.GetMediaStream();
        wrapper = new ikvm.io.InputStreamWrapper(stream);
        doc = PDDocument.load(wrapper);

        if (doc.isEncrypted())
        {
            string[] pwArray = LoadPasswords();

            // Keep the loaded document in 'doc' until decryption is known to have
            // succeeded, so that a failed decryption still closes it in 'finally'.
            // NOTE(review): assumes Decrypt does not leave 'doc' in a state where a
            // later close() is harmful - confirm against Decrypt's implementation.
            PDDocument decrypted = Decrypt(doc, pwArray);
            if (decrypted == null)
            {
                Log.Warn("PdfUtil :: ParsePDF :: Decryption Failed for: [" + mediaItem.Name + "]", typeof(PdfUtil));
                return string.Empty;
            }

            doc = decrypted;
            Log.Debug("PdfUtil :: ParsePDF :: Successfully decrypted [" + mediaItem.Name + "]", typeof(PdfUtil));
        }

        var stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    catch (Exception ex)
    {
        // Pass the owner type like the other log calls in this method, so the
        // exception is supplied as the exception argument rather than as the owner.
        Log.Error("PdfUtil :: ParsePDF :: Error parsing pdf: [" + mediaItem.Name + "]", ex, typeof(PdfUtil));
        return string.Empty;
    }
    finally
    {
        // Close the wrapper even when no document was loaded, and even when
        // closing the document itself throws.
        try
        {
            if (doc != null)
            {
                doc.close();
            }
        }
        finally
        {
            if (wrapper != null)
            {
                wrapper.close();
            }
        }
    }
}
/// <summary>
/// Extracts content from a .NET stream by delegating to the <c>Parse</c> overload
/// that takes a stream factory.
/// </summary>
/// <typeparam name="TExtractionResult">Type produced by <paramref name="extractionResultAssembler"/>.</typeparam>
/// <param name="inputStream">Stream to read; it is wrapped for the Java side when the factory is invoked.</param>
/// <param name="extractionResultAssembler">Function that receives a string and the metadata and builds the result.</param>
/// <returns>The value returned by the delegated <c>Parse</c> call.</returns>
/// <exception cref="TextExtractionException">Wraps any exception raised by the delegated call.</exception>
public TExtractionResult Parse<TExtractionResult>(
    System.IO.Stream inputStream,
    Func<string, Metadata, TExtractionResult> extractionResultAssembler)
{
    // Stream factory handed to the inner overload; the metadata argument is not used here.
    InputStream OpenTikaStream(Metadata metadata) =>
        TikaInputStream.get(new ikvm.io.InputStreamWrapper(inputStream));

    try
    {
        return Parse(OpenTikaStream, extractionResultAssembler);
    }
    catch (Exception ex)
    {
        throw new TextExtractionException("Extraction of text from stream failed.", ex);
    }
}
/// <summary>
/// Converts Java source text into C# source text.
/// </summary>
/// <param name="javaText">Java source to convert; <c>null</c> is treated as an empty string.</param>
/// <param name="options">
/// Conversion options; a default instance is created when <c>null</c>. This method
/// sets <c>IncludeUsings</c> to false on the instance when the parsed source has no
/// imports, and <c>IncludeNamespace</c> to false when it has no package.
/// </param>
/// <returns>The text of the generated, whitespace-normalized C# compilation unit.</returns>
public static string ConvertText(string javaText, JavaConversionOptions options = null)
{
    options = options ?? new JavaConversionOptions();
    options.ConversionStateChanged(ConversionState.Starting);

    var context = new ConversionContext(options);
    byte[] sourceBytes = Encoding.UTF8.GetBytes(javaText ?? string.Empty);

    using (var sourceStream = new MemoryStream(sourceBytes))
    using (var javaInput = new ikvm.io.InputStreamWrapper(sourceStream))
    {
        options.ConversionStateChanged(ConversionState.ParsingJavaAST);
        var compilationUnit = JavaParser.parse(javaInput);

        options.ConversionStateChanged(ConversionState.BuildingCSharpAST);
        var declaredTypes = compilationUnit.getTypes().ToList<TypeDeclaration>();

        // No imports in the Java source: emit no using directives.
        var javaImports = compilationUnit.getImports();
        if (javaImports == null || javaImports.isEmpty())
        {
            options.IncludeUsings = false;
        }

        // No package in the Java source: emit no namespace.
        var javaPackage = compilationUnit.getPackage();
        if (javaPackage == null)
        {
            options.IncludeNamespace = false;
        }

        var usingDirectives = new List<UsingDirectiveSyntax>();

        //foreach (var import in imports)
        //{
        //    var usingSyntax = Syntax.UsingDirective(Syntax.ParseName(import.getName().toString()));
        //    usings.Add(usingSyntax);
        //}

        // Using directives come from the options, not from the Java imports.
        if (options.IncludeUsings)
        {
            foreach (var ns in options.Usings)
            {
                usingDirectives.Add(Syntax.UsingDirective(Syntax.ParseName(ns)));
            }
        }

        NamespaceDeclarationSyntax namespaceNode = null;
        if (options.IncludeNamespace)
        {
            // Apply every configured replacement to the package name, then capitalize it.
            string namespaceName = javaPackage.getName().toString();
            foreach (var replacement in options.PackageReplacements)
            {
                namespaceName = replacement.Replace(namespaceName);
            }

            namespaceName = TypeHelper.Capitalize(namespaceName);
            namespaceNode = Syntax.NamespaceDeclaration(Syntax.ParseName(namespaceName));
        }

        var rootMembers = new List<MemberDeclarationSyntax>();
        foreach (var declaredType in declaredTypes)
        {
            // Only class and interface declarations are converted.
            var declaration = declaredType as ClassOrInterfaceDeclaration;
            if (declaration == null)
            {
                continue;
            }

            if (!declaration.isInterface())
            {
                var classNode = VisitClassDeclaration(context, declaration, false);
                if (options.IncludeNamespace)
                {
                    namespaceNode = namespaceNode.AddMembers(classNode);
                }
                else if (options.IncludeClass)
                {
                    rootMembers.Add(classNode);
                }
                else
                {
                    // Neither namespace nor class wrapper requested: emit the methods only.
                    rootMembers.AddRange(classNode.Members.OfType<MethodDeclarationSyntax>());
                }

                continue;
            }

            var interfaceNode = VisitInterfaceDeclaration(context, declaration, false);
            if (options.IncludeNamespace)
            {
                namespaceNode = namespaceNode.AddMembers(interfaceNode);
            }
            else if (options.IncludeClass)
            {
                rootMembers.Add(interfaceNode);
            }
        }

        if (options.IncludeNamespace)
        {
            rootMembers.Add(namespaceNode);
        }

        var root = Syntax.CompilationUnit(
                externs: null,
                usings: Syntax.List(usingDirectives.ToArray()),
                attributeLists: null,
                members: Syntax.List<MemberDeclarationSyntax>(rootMembers))
            .NormalizeWhitespace();

        var tree = SyntaxTree.Create(root);

        options.ConversionStateChanged(ConversionState.Done);
        return tree.GetText().ToString();
    }
}
/// <summary>
/// Converts Java source text into C# source text.
/// </summary>
/// <param name="javaText">Java source to convert; <c>null</c> is treated as an empty string.</param>
/// <param name="options">Conversion options; a default instance is created when <c>null</c>.</param>
/// <returns>The text of the generated, whitespace-normalized C# compilation unit.</returns>
/// <remarks>
/// When the parsed source has no package, no namespace is emitted even if
/// <c>options.IncludeNamespace</c> is set; the converted types are then added to
/// the compilation unit root instead.
/// </remarks>
public static string ConvertText(string javaText, JavaConversionOptions options = null)
{
    if (options == null)
    {
        options = new JavaConversionOptions();
    }

    options.ConversionStateChanged(ConversionState.Starting);

    var context = new ConversionContext(options);
    var textBytes = Encoding.UTF8.GetBytes(javaText ?? string.Empty);

    using (var stringreader = new MemoryStream(textBytes))
    using (var wrapper = new ikvm.io.InputStreamWrapper(stringreader))
    {
        options.ConversionStateChanged(ConversionState.ParsingJavaAST);

        var parsed = JavaParser.parse(wrapper);

        options.ConversionStateChanged(ConversionState.BuildingCSharpAST);

        var types = parsed.getTypes().ToList<TypeDeclaration>();
        var package = parsed.getPackage();

        // A namespace can only be built from a package. Without this check a source
        // that has no package (getPackage() == null) fails with a
        // NullReferenceException on package.getName() below.
        bool includeNamespace = options.IncludeNamespace && package != null;

        // Java imports are not translated; using directives come from options.Usings only.
        var usings = new List<UsingDirectiveSyntax>();

        if (options.IncludeUsings)
        {
            foreach (var ns in options.Usings.Where(x => !string.IsNullOrWhiteSpace(x)))
            {
                var usingSyntax = SyntaxFactory.UsingDirective(SyntaxFactory.ParseName(ns));
                usings.Add(usingSyntax);
            }
        }

        var rootMembers = new List<MemberDeclarationSyntax>();
        NamespaceDeclarationSyntax namespaceSyntax = null;

        if (includeNamespace)
        {
            // Apply every configured replacement to the package name, then capitalize it.
            string packageName = package.getName().toString();

            foreach (var packageReplacement in options.PackageReplacements)
            {
                packageName = packageReplacement.Replace(packageName);
            }

            packageName = TypeHelper.Capitalize(packageName);

            namespaceSyntax = SyntaxFactory.NamespaceDeclaration(SyntaxFactory.ParseName(packageName));
        }

        foreach (var type in types)
        {
            // Only class and interface declarations are converted.
            var classOrIntType = type as ClassOrInterfaceDeclaration;
            if (classOrIntType == null)
            {
                continue;
            }

            if (classOrIntType.isInterface())
            {
                var interfaceSyntax = VisitInterfaceDeclaration(context, classOrIntType, false);

                if (includeNamespace)
                {
                    namespaceSyntax = namespaceSyntax.AddMembers(interfaceSyntax);
                }
                else
                {
                    rootMembers.Add(interfaceSyntax);
                }
            }
            else
            {
                var classSyntax = VisitClassDeclaration(context, classOrIntType, false);

                if (includeNamespace)
                {
                    namespaceSyntax = namespaceSyntax.AddMembers(classSyntax);
                }
                else
                {
                    rootMembers.Add(classSyntax);
                }
            }
        }

        if (includeNamespace)
        {
            rootMembers.Add(namespaceSyntax);
        }

        var root = SyntaxFactory.CompilationUnit(
                externs: new SyntaxList<ExternAliasDirectiveSyntax>(),
                usings: SyntaxFactory.List(usings.ToArray()),
                attributeLists: new SyntaxList<AttributeListSyntax>(),
                members: SyntaxFactory.List<MemberDeclarationSyntax>(rootMembers))
            .NormalizeWhitespace();

        var tree = SyntaxFactory.SyntaxTree(root);

        options.ConversionStateChanged(ConversionState.Done);

        return tree.GetText().ToString();
    }
}
/// <summary>
/// Extracts the text content of a PDF media item.
/// </summary>
/// <param name="oMediaItem">
/// Media item to read. Extraction is attempted only when the item is not null and
/// its <c>Extension</c> equals "pdf".
/// </param>
/// <returns>
/// The text produced by the PDF text stripper. An empty string is returned when the
/// item is skipped or when extraction throws (the error is logged).
/// </returns>
public static string GetContent(MediaItem oMediaItem)
{
    string sResult = string.Empty;
    PDDocument doc = null;
    ikvm.io.InputStreamWrapper wrapper = null;

    if (oMediaItem != null && oMediaItem.Extension == "pdf")
    {
        try
        {
            //Stream stream = oMediaItem.GetMediaStream();
            //MemoryStream ms = new MemoryStream();
            //stream.CopyTo(ms);
            //ByteArrayInputStream oByteArrayStream = new ByteArrayInputStream(ms.GetBuffer());
            //ms.Close();
            //stream.Close();
            //doc = PDDocument.load(oByteArrayStream);
            //PDFTextStripper stripper = new PDFTextStripper();
            //sResult = stripper.getText(doc);

            /*
             * Using the code above generates several exceptions on content retrieval, all with the same error message, example:
             * 23864 18:28:32 ERROR PDF GetContent: Could not extract from mediaItem: {57F2C94F-1204-4975-BABE-287B167F07AF}, media item path: [/sitecore/media library/Files/Investments/AssetMark/Monthly Performance/09-2013/GFWM_3301_MonthlyPerformanceAR_2013_09_C11582pdf?sc_database=web]
             * Exception: java.io.IOException
             * Message: Push back buffer is full
             * Source: pdfbox-1.6.0
             *    at org.apache.pdfbox.pdfparser.PDFParser.parse()
             *    at org.apache.pdfbox.pdmodel.PDDocument.load(InputStream input, RandomAccess scratchFile)
             *    at org.apache.pdfbox.pdmodel.PDDocument.load(InputStream input)
             *    at Genworth.SitecoreExt.Helpers.PDFExtractorHelper.GetContent(MediaItem oMediaItem)
             *
             * According to this web reference (https://issues.apache.org/jira/browse/PDFBOX-1818), the issue has been fixed in versions 1.8.4 and 2.0.0. Current PDFBox version: 1.6.0.
             *
             * However, the code below does not show the issue seen with the code above.
             *
             * NOTE(review): the earlier attempt passed MemoryStream.GetBuffer(), which returns the whole
             * underlying buffer including unused bytes, not just the copied data - this may have
             * contributed to the failures; confirm before reviving that approach.
             */

            wrapper = new ikvm.io.InputStreamWrapper(oMediaItem.GetMediaStream());
            doc = PDDocument.load(wrapper);
            sResult = new PDFTextStripper().getText(doc);

            int contentLength = sResult != null ? sResult.Length : 0;
            Sitecore.Diagnostics.Log.Info("PDF Content extraction successful, media item ID: " + oMediaItem.ID.ToString() + ", content length: " + contentLength, typeof(PDFExtractorHelper));
        }
        catch (Exception Ex)
        {
            Sitecore.Diagnostics.Log.Error("PDF GetContent: Could not extract from mediaItem: " + oMediaItem.ID.ToString() + ", media item path: [" + oMediaItem.Path + "]", Ex, typeof(PDFExtractorHelper));
        }
        finally
        {
            // Nested so that the wrapper is closed even when closing the document throws.
            try
            {
                if (doc != null)
                {
                    doc.close();
                }
            }
            finally
            {
                if (wrapper != null)
                {
                    wrapper.close();
                }
            }
        }
    }

    return sResult;
}