// Checks that a MaxentTagger can be constructed from a plain (uncompressed) model file
// that is streamed to the Java side through an IKVM InputStreamWrapper.
public void MaxentTaggerTest()
        {
            // Plain model in the file
            var model = Files.Parser.Models("pos-tagger/english-left3words-distsim.tagger");

            // The model is only read: ask for read-only access so the test also works when the
            // model file is read-only (FileStream(path, FileMode.Open) would request read/write).
            using var fs  = new FileStream(model, FileMode.Open, FileAccess.Read, FileShare.Read);
            using var isw = new ikvm.io.InputStreamWrapper(fs);

            var tagger = new MaxentTagger(isw);

            Assert.NotNull(tagger);
        }
예제 #2
0
        //public edu.stanford.nlp.parser.lexparser.LexicalizedParser lp;

        /// <summary>
        /// Ribbon load handler. Runs <c>loadBoilerplate</c>, fills the language drop-down with the
        /// languages exposed by the Word application (sorted by local name), shows the stored
        /// phrase-length settings in the two edit boxes, and loads the POS tagger from the embedded
        /// <c>posmodel</c> resource.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Ribbon event arguments (not used).</param>
        private void Ribbon1_Load(object sender, RibbonUIEventArgs e)
        {
            loadBoilerplate();

            // Languages: pair each Word language with its localized display name.
            var namedLanguages = new List <Tuple <string, Word.Language> >();
            foreach (Word.Language wordLanguage in Globals.ThisAddIn.Application.Languages)
            {
                namedLanguages.Add(Tuple.Create(wordLanguage.NameLocal, wordLanguage));
            }

            // Order the entries by display name before they go into the drop-down.
            namedLanguages.Sort((left, right) => left.Item1.CompareTo(right.Item1));

            foreach (var entry in namedLanguages)
            {
                RibbonDropDownItem dropDownItem = Globals.Factory.GetRibbonFactory().CreateRibbonDropDownItem();
                dropDownItem.Label = entry.Item1;
                dropDownItem.Tag   = entry.Item2.ID;   // the language ID travels with the item
                dd_Langs.Items.Add(dropDownItem);

                // Pre-selection of the stored proofing language is currently disabled:
                //if (entry.Item1 == Properties.Settings.Default.Options_ProofLanguage)
                //{
                //    dd_Langs.SelectedItem = dropDownItem;
                //}
            }

            // Phrase length limits: read both settings, then show them in the edit boxes.
            uint shortestPhrase = Properties.Settings.Default.Options_PhraseLengthMin;
            uint longestPhrase  = Properties.Settings.Default.Options_PhraseLengthMax;
            edit_MinPhraseLen.Text = shortestPhrase.ToString();
            edit_MaxPhraseLen.Text = longestPhrase.ToString();

            // POS tagger: the model bytes come from the embedded resource and are handed to the
            // Java-side constructor through an IKVM stream wrapper.
            Debug.WriteLine("Loading tagger model...");
            MemoryStream modelBytes = new MemoryStream(Properties.Resources.posmodel);
            java.io.InputStream modelInput = new ikvm.io.InputStreamWrapper(modelBytes);
            tagger = new MaxentTagger(modelInput);
            Debug.WriteLine("Model loaded.");

            // Typed dependencies (lexical parser) loading is currently disabled:
            //Debug.WriteLine("Loading lexical parser model...");
            //modelBytes = new MemoryStream(Properties.Resources.englishPCFG_ser);
            //var isw = new ikvm.io.InputStreamWrapper(modelBytes);
            //var gzs = new java.util.zip.GZIPInputStream(isw);
            //var ois = new java.io.ObjectInputStream(gzs);
            //lp = LexicalizedParser.loadModel(ois);
            //Debug.WriteLine("Model loaded.");
        }
        public void LexicalizedParserTest()
        {
            // Checks that a LexicalizedParser can be loaded from a serialized model file that is
            // streamed to the Java side through an IKVM InputStreamWrapper.

            // GZIPed model in the file
            var model = Files.Parser.Models("lexparser/englishPCFG.ser.gz");

            // The model is only read: ask for read-only access so the test also works when the
            // model file is read-only (FileStream(path, FileMode.Open) would request read/write).
            using var fs  = new FileStream(model, FileMode.Open, FileAccess.Read, FileShare.Read);
            using var isw = new ikvm.io.InputStreamWrapper(fs);

            // A ".gz" model has to be decompressed before it is deserialized. The suffix test is
            // ordinal so the result does not depend on the current culture.
            using var ois =
                      model.EndsWith(".gz", System.StringComparison.Ordinal)
                    ? new ObjectInputStream(new GZIPInputStream(isw))
                    : new ObjectInputStream(isw);

            var lp = LexicalizedParser.loadModel(ois);

            Assert.NotNull(lp);
        }
예제 #4
0
        /// <summary>
        /// Parses a pdf item and returns its content as string.
        /// </summary>
        /// <param name="mediaItem">MediaItem (should be a pdf - otherwise an empty string will be returned).
        /// A <c>null</c> item also yields an empty string.</param>
        /// <returns>String representation of the pdf content, or an empty string when the item is not a pdf,
        /// has no media stream, cannot be decrypted, or parsing throws.</returns>
        public static string ParsePdf(MediaItem mediaItem)
        {
            if (mediaItem == null || mediaItem.MimeType != PdfMimetype)
            {
                return string.Empty;
            }

            PDDocument loaded = null;   // the document exactly as returned by PDDocument.load
            PDDocument doc = null;      // the document the text is extracted from
            ikvm.io.InputStreamWrapper wrapper = null;

            try
            {
                var stream = mediaItem.GetMediaStream();
                if (stream == null)
                {
                    // Without a stream there is nothing to parse, and a wrapper around null
                    // could not be closed safely in the finally block.
                    Log.Warn("PdfUtil :: ParsePDF :: No media stream for: [" + mediaItem.Name + "]", typeof(PdfUtil));
                    return string.Empty;
                }

                wrapper = new ikvm.io.InputStreamWrapper(stream);
                loaded = PDDocument.load(wrapper);
                doc = loaded;

                if (doc.isEncrypted())
                {
                    string[] pwArray = LoadPasswords();

                    // Decrypt returns null on failure; 'loaded' keeps the original document
                    // reachable so that it is still closed in the finally block.
                    doc = Decrypt(loaded, pwArray);
                    if (doc == null)
                    {
                        Log.Warn("PdfUtil :: ParsePDF :: Decryption Failed for: [" + mediaItem.Name + "]", typeof(PdfUtil));
                        return string.Empty;
                    }

                    Log.Debug("PdfUtil :: ParsePDF :: Successfully decrypted [" + mediaItem.Name + "]", typeof(PdfUtil));
                }

                var stripper = new PDFTextStripper();
                return stripper.getText(doc);
            }
            catch (Exception ex)
            {
                // Pass the exception as its own argument (plus the owner type, like the other log
                // calls here) so that it is logged as an exception rather than as the log owner.
                // NOTE(review): assumes Log is Sitecore.Diagnostics.Log - confirm the overload.
                Log.Error("PdfUtil :: ParsePDF :: Error parsing pdf: [" + mediaItem.Name + "]", ex, typeof(PdfUtil));
                return string.Empty;
            }
            finally
            {
                try
                {
                    if (doc != null)
                    {
                        doc.close();
                    }

                    // Covers a failed decryption (doc is null) and a Decrypt that hands back a
                    // different instance. NOTE(review): assumes Decrypt does not close its input
                    // document itself - confirm.
                    if (loaded != null && !object.ReferenceEquals(loaded, doc))
                    {
                        loaded.close();
                    }
                }
                finally
                {
                    // Close the wrapper even when loading failed or closing a document threw.
                    if (wrapper != null)
                    {
                        wrapper.close();
                    }
                }
            }
        }
        /// <summary>
        /// Extracts content from a .NET stream by delegating to the stream-factory overload of
        /// <c>Parse</c>.
        /// </summary>
        /// <typeparam name="TExtractionResult">Type produced by <paramref name="extractionResultAssembler"/>.</typeparam>
        /// <param name="inputStream">Stream to read; it is wrapped for the Java side when the factory is invoked.</param>
        /// <param name="extractionResultAssembler">Builds the result from a string and a <c>Metadata</c> instance.</param>
        /// <returns>Whatever the delegated <c>Parse</c> call returns.</returns>
        /// <exception cref="TextExtractionException">Wraps any exception thrown by the delegated call.</exception>
        public TExtractionResult Parse <TExtractionResult>(
            System.IO.Stream inputStream,
            Func <string, Metadata, TExtractionResult> extractionResultAssembler
            )
        {
            // Stream factory handed to the delegated Parse: wraps the .NET stream in an IKVM input
            // stream and turns that into a TikaInputStream. The metadata argument is not used.
            InputStream OpenTikaStream(Metadata metadata) =>
                TikaInputStream.get(new ikvm.io.InputStreamWrapper(inputStream));

            try
            {
                return Parse(OpenTikaStream, extractionResultAssembler);
            }
            catch (Exception failure)
            {
                throw new TextExtractionException("Extraction of text from stream failed.", failure);
            }
        }
        /// <summary>
        /// Converts Java source text into C# source text.
        /// </summary>
        /// <param name="javaText">Java source to convert; <c>null</c> is treated as an empty string.</param>
        /// <param name="options">Conversion options; a default instance is used when <c>null</c>.
        /// On this instance <c>IncludeUsings</c> is switched off when the source has no imports and
        /// <c>IncludeNamespace</c> is switched off when it has no package declaration.</param>
        /// <returns>The whitespace-normalized text of the generated C# compilation unit.</returns>
        public static string ConvertText(string javaText, JavaConversionOptions options = null)
        {
            options = options ?? new JavaConversionOptions();

            options.ConversionStateChanged(ConversionState.Starting);

            var context = new ConversionContext(options);

            // The Java parser reads from a stream, so the text is encoded as UTF-8 bytes first.
            var sourceBytes = Encoding.UTF8.GetBytes(javaText ?? string.Empty);

            using (var sourceStream = new MemoryStream(sourceBytes))
            using (var javaInput = new ikvm.io.InputStreamWrapper(sourceStream))
            {
                options.ConversionStateChanged(ConversionState.ParsingJavaAST);

                var javaUnit = JavaParser.parse(javaInput);

                options.ConversionStateChanged(ConversionState.BuildingCSharpAST);

                var declaredTypes = javaUnit.getTypes().ToList<TypeDeclaration>();

                // No imports in the Java source: no using directives in the output.
                var javaImports = javaUnit.getImports();
                if (javaImports == null || javaImports.isEmpty())
                {
                    options.IncludeUsings = false;
                }

                // No package declaration in the Java source: no namespace in the output.
                var javaPackage = javaUnit.getPackage();
                if (javaPackage == null)
                {
                    options.IncludeNamespace = false;
                }

                // The using directives come from the configured list, not from the Java imports.
                var usingDirectives = new List<UsingDirectiveSyntax>();
                if (options.IncludeUsings)
                {
                    foreach (var usingName in options.Usings)
                    {
                        usingDirectives.Add(Syntax.UsingDirective(Syntax.ParseName(usingName)));
                    }
                }

                var rootMembers = new List<MemberDeclarationSyntax>();
                NamespaceDeclarationSyntax namespaceSyntax = null;

                if (options.IncludeNamespace)
                {
                    // Namespace name = package name, after the configured replacements and capitalization.
                    string namespaceName = javaPackage.getName().toString();
                    foreach (var replacement in options.PackageReplacements)
                    {
                        namespaceName = replacement.Replace(namespaceName);
                    }

                    namespaceName = TypeHelper.Capitalize(namespaceName);
                    namespaceSyntax = Syntax.NamespaceDeclaration(Syntax.ParseName(namespaceName));
                }

                foreach (var declaration in declaredTypes)
                {
                    // Only class and interface declarations are converted.
                    var classOrInterface = declaration as ClassOrInterfaceDeclaration;
                    if (classOrInterface == null)
                    {
                        continue;
                    }

                    if (classOrInterface.isInterface())
                    {
                        var convertedInterface = VisitInterfaceDeclaration(context, classOrInterface, false);

                        if (options.IncludeNamespace)
                        {
                            namespaceSyntax = namespaceSyntax.AddMembers(convertedInterface);
                        }
                        else if (options.IncludeClass)
                        {
                            rootMembers.Add(convertedInterface);
                        }

                        continue;
                    }

                    var convertedClass = VisitClassDeclaration(context, classOrInterface, false);

                    if (options.IncludeNamespace)
                    {
                        namespaceSyntax = namespaceSyntax.AddMembers(convertedClass);
                    }
                    else if (options.IncludeClass)
                    {
                        rootMembers.Add(convertedClass);
                    }
                    else
                    {
                        // Neither namespace nor class wrapper requested: emit only the methods.
                        rootMembers.AddRange(convertedClass.Members.OfType<MethodDeclarationSyntax>());
                    }
                }

                if (options.IncludeNamespace)
                {
                    rootMembers.Add(namespaceSyntax);
                }

                var compilationUnit = Syntax.CompilationUnit(
                    externs: null,
                    usings: Syntax.List(usingDirectives.ToArray()),
                    attributeLists: null,
                    members: Syntax.List<MemberDeclarationSyntax>(rootMembers));
                var root = compilationUnit.NormalizeWhitespace();

                var tree = SyntaxTree.Create(root);

                options.ConversionStateChanged(ConversionState.Done);

                return tree.GetText().ToString();
            }
        }
예제 #7
0
        /// <summary>
        /// Converts Java source text into C# source text.
        /// </summary>
        /// <param name="javaText">Java source to convert; <c>null</c> is treated as an empty string.</param>
        /// <param name="options">Conversion options; a default instance is used when <c>null</c>.</param>
        /// <returns>The whitespace-normalized text of the generated C# compilation unit. When the Java
        /// source has no package declaration the converted types are emitted without a namespace,
        /// even if <c>IncludeNamespace</c> is set.</returns>
        public static string ConvertText(string javaText, JavaConversionOptions options = null)
        {
            if (options == null)
            {
                options = new JavaConversionOptions();
            }

            options.ConversionStateChanged(ConversionState.Starting);

            var context = new ConversionContext(options);

            // The Java parser reads from a stream, so the text is encoded as UTF-8 bytes first.
            var textBytes = Encoding.UTF8.GetBytes(javaText ?? string.Empty);

            using (var stringreader = new MemoryStream(textBytes))
            using (var wrapper = new ikvm.io.InputStreamWrapper(stringreader))
            {
                options.ConversionStateChanged(ConversionState.ParsingJavaAST);

                var parsed = JavaParser.parse(wrapper);

                options.ConversionStateChanged(ConversionState.BuildingCSharpAST);

                var types   = parsed.getTypes().ToList <TypeDeclaration>();
                var package = parsed.getPackage();

                // Source in the default package has no package declaration, so there is no name
                // to build a namespace from; dereferencing the missing package would throw.
                bool emitNamespace = options.IncludeNamespace && package != null;

                // The Java imports are not translated; the using directives come from the
                // configured list only (blank entries are skipped).
                var usings = new List <UsingDirectiveSyntax>();
                if (options.IncludeUsings)
                {
                    foreach (var ns in options.Usings.Where(x => !string.IsNullOrWhiteSpace(x)))
                    {
                        var usingSyntax = SyntaxFactory.UsingDirective(SyntaxFactory.ParseName(ns));
                        usings.Add(usingSyntax);
                    }
                }

                var rootMembers = new List <MemberDeclarationSyntax>();
                NamespaceDeclarationSyntax namespaceSyntax = null;

                if (emitNamespace)
                {
                    // Namespace name = package name, after the configured replacements and capitalization.
                    string packageName = package.getName().toString();

                    foreach (var packageReplacement in options.PackageReplacements)
                    {
                        packageName = packageReplacement.Replace(packageName);
                    }

                    packageName = TypeHelper.Capitalize(packageName);

                    namespaceSyntax = SyntaxFactory.NamespaceDeclaration(SyntaxFactory.ParseName(packageName));
                }

                foreach (var type in types)
                {
                    // Only class and interface declarations are converted.
                    var classOrIntType = type as ClassOrInterfaceDeclaration;
                    if (classOrIntType == null)
                    {
                        continue;
                    }

                    if (classOrIntType.isInterface())
                    {
                        var interfaceSyntax = VisitInterfaceDeclaration(context, classOrIntType, false);

                        if (emitNamespace)
                        {
                            namespaceSyntax = namespaceSyntax.AddMembers(interfaceSyntax);
                        }
                        else
                        {
                            rootMembers.Add(interfaceSyntax);
                        }
                    }
                    else
                    {
                        var classSyntax = VisitClassDeclaration(context, classOrIntType, false);

                        if (emitNamespace)
                        {
                            namespaceSyntax = namespaceSyntax.AddMembers(classSyntax);
                        }
                        else
                        {
                            rootMembers.Add(classSyntax);
                        }
                    }
                }

                if (emitNamespace)
                {
                    rootMembers.Add(namespaceSyntax);
                }

                var root = SyntaxFactory.CompilationUnit(
                    externs: new SyntaxList <ExternAliasDirectiveSyntax>(),
                    usings: SyntaxFactory.List(usings.ToArray()),
                    attributeLists: new SyntaxList <AttributeListSyntax>(),
                    members: SyntaxFactory.List <MemberDeclarationSyntax>(rootMembers))
                           .NormalizeWhitespace();

                var tree = SyntaxFactory.SyntaxTree(root);

                options.ConversionStateChanged(ConversionState.Done);

                return tree.GetText().ToString();
            }
        }
예제 #8
0
        /// <summary>
        /// Extracts the text content of a PDF media item.
        /// </summary>
        /// <param name="oMediaItem">Media item to read. Only items whose extension is "pdf"
        /// (compared without regard to letter case) are processed.</param>
        /// <returns>The extracted text. An empty string is returned when the item is <c>null</c>,
        /// is not a pdf, has no media stream, or extraction throws (the failure is logged).</returns>
        public static string GetContent(MediaItem oMediaItem)
        {
            string sResult = string.Empty;

            // Extensions are compared case-insensitively so that "PDF" is accepted as well.
            if (oMediaItem == null || !string.Equals(oMediaItem.Extension, "pdf", StringComparison.OrdinalIgnoreCase))
            {
                return sResult;
            }

            PDDocument doc = null;
            ikvm.io.InputStreamWrapper wrapper = null;

            try
            {
                var mediaStream = oMediaItem.GetMediaStream();
                if (mediaStream == null)
                {
                    // Nothing to read, and a wrapper around null could not be closed safely below.
                    Sitecore.Diagnostics.Log.Warn("PDF GetContent: No media stream for mediaItem: " + oMediaItem.ID.ToString() + ", media item path: [" + oMediaItem.Path + "]", typeof(PDFExtractorHelper));
                    return sResult;
                }

                /*
                 * Keep wrapping the media stream directly. An earlier version copied the stream into a
                 * MemoryStream and loaded the document from a ByteArrayInputStream over ms.GetBuffer();
                 * with pdfbox-1.6.0 that produced, for several items, exceptions such as:
                 *
                 *  Exception: java.io.IOException
                 *  Message: Push back buffer is full
                 *  Source: pdfbox-1.6.0
                 *     at org.apache.pdfbox.pdfparser.PDFParser.parse()
                 *     at org.apache.pdfbox.pdmodel.PDDocument.load(InputStream input, RandomAccess scratchFile)
                 *     at org.apache.pdfbox.pdmodel.PDDocument.load(InputStream input)
                 *
                 * According to https://issues.apache.org/jira/browse/PDFBOX-1818 the issue is fixed in
                 * PDFBox 1.8.4 and 2.0.0. Loading through the wrapper below did not show the problem.
                 */
                wrapper = new ikvm.io.InputStreamWrapper(mediaStream);
                doc     = PDDocument.load(wrapper);
                sResult = new PDFTextStripper().getText(doc);

                int contentLength = sResult != null ? sResult.Length : 0;
                Sitecore.Diagnostics.Log.Info("PDF Content extraction successful, media item ID: " + oMediaItem.ID.ToString() + ", content length: " + contentLength, typeof(PDFExtractorHelper));
            }
            catch (Exception Ex)
            {
                Sitecore.Diagnostics.Log.Error("PDF GetContent: Could not extract from mediaItem: " + oMediaItem.ID.ToString() + ", media item path: [" + oMediaItem.Path + "]", Ex, typeof(PDFExtractorHelper));
            }
            finally
            {
                try
                {
                    if (doc != null)
                    {
                        doc.close();
                    }
                }
                finally
                {
                    // Close the wrapper even when closing the document throws.
                    if (wrapper != null)
                    {
                        wrapper.close();
                    }
                }
            }

            return sResult;
        }