Exemple #1
0
        /// <summary>Escapes the tokens of an already-tokenized input and prints them, one per line.</summary>
        /// <remarks>
        /// The input should already be tokenized, with tokens separated by whitespace.
        /// An argument starting with <c>http://</c> is loaded as a URL and has its tags
        /// stripped before escaping; anything else is opened as a local file. <br />
        /// Usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl
        /// </remarks>
        /// <param name="args">Command line argument: a file or URL</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Out.WriteLine("usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl");
                return;
            }
            string filename = args[0];

            try
            {
                IDocument <string, Word, Word> d;
                // The scheme prefix is a protocol token, not natural-language text, so it is
                // compared ordinally instead of with the current culture's rules.
                if (filename.StartsWith("http://", System.StringComparison.Ordinal))
                {
                    // Remote documents go through tag stripping before they are escaped.
                    IDocument <string, Word, Word> dpre = new BasicDocument <string>(WhitespaceTokenizer.Factory()).Init(new URL(filename));
                    IDocumentProcessor <Word, Word, string, Word> notags = new StripTagsProcessor <string, Word>();
                    d = notags.ProcessDocument(dpre);
                }
                else
                {
                    d = new BasicDocument <string>(WhitespaceTokenizer.Factory()).Init(new File(filename));
                }
                IDocumentProcessor <Word, IHasWord, string, Word> proc = new Edu.Stanford.Nlp.Process.PTBEscapingProcessor <Word, string, Word>();
                IDocument <string, Word, IHasWord> newD = proc.ProcessDocument(d);
                foreach (IHasWord word in newD)
                {
                    System.Console.Out.WriteLine(word);
                }
            }
            catch (Exception e)
            {
                // Any failure while loading or processing is reported as a stack trace.
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
        /// <summary>For internal debugging purposes only.</summary>
        /// <remarks>
        /// Builds a document from a fixed html snippet, prints it, runs it through a
        /// tag-stripping processor and prints the result, then runs that result through a
        /// word-to-sentence processor and prints the sentences.
        /// </remarks>
        /// <param name="args">Command line arguments; not read by this method.</param>
        public static void Main(string[] args)
        {
            // NOTE(review): this instance is discarded immediately; it is kept because the
            // constructor's effects are not visible from here.
            new BasicDocument <string>();
            IDocument <string, Word, Word> markupDoc = BasicDocument.Init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");

            // Stage 1: the raw document, tags included.
            System.Console.WriteLine("Before:");
            System.Console.WriteLine(markupDoc);

            // Stage 2: the same document after tag stripping.
            Edu.Stanford.Nlp.Process.StripTagsProcessor <string, Word> tagStripper = new Edu.Stanford.Nlp.Process.StripTagsProcessor <string, Word>(true);
            IDocument <string, Word, Word> plainDoc = tagStripper.ProcessDocument(markupDoc);
            System.Console.WriteLine("After:");
            System.Console.WriteLine(plainDoc);

            // Stage 3: the stripped words grouped into sentences.
            WordToSentenceProcessor <Word> sentenceSplitter = new WordToSentenceProcessor <Word>();
            IDocument <string, Word, IList <Word> > sentenceDoc = sentenceSplitter.ProcessDocument(plainDoc);
            System.Console.WriteLine("Sentences:");
            System.Console.WriteLine(sentenceDoc);
        }
Exemple #3
0
        /// <summary>Prints the words of a document, recognizing tags, one numbered word per line.</summary>
        /// <remarks>
        /// It can be used to test tag breaking. An argument starting with <c>http://</c>
        /// is loaded as a URL and has its tags stripped first; anything else is opened as
        /// a local file. <br />  Usage: <code>
        /// java edu.stanford.nlp.process.WordToTaggedWordProcessor fileOrUrl
        /// </code>
        /// </remarks>
        /// <param name="args">Command line argument: a file or URL</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Out.WriteLine("usage: java edu.stanford.nlp.process.WordToTaggedWordProcessor fileOrUrl");
                // Terminates the process, so the code below only runs with exactly one argument.
                System.Environment.Exit(0);
            }
            string filename = args[0];

            try
            {
                IDocument <IHasWord, Word, Word> d;
                // The scheme prefix is a protocol token, not natural-language text, so it is
                // compared ordinally instead of with the current culture's rules.
                if (filename.StartsWith("http://", System.StringComparison.Ordinal))
                {
                    // Remote documents go through tag stripping before they are processed.
                    IDocument <IHasWord, Word, Word> dpre = new BasicDocument <IHasWord>().Init(new URL(filename));
                    IDocumentProcessor <Word, Word, IHasWord, Word> notags = new StripTagsProcessor <IHasWord, Word>();
                    d = notags.ProcessDocument(dpre);
                }
                else
                {
                    d = new BasicDocument <IHasWord>().Init(new File(filename));
                }
                IDocumentProcessor <Word, IHasWord, IHasWord, Word> proc = new Edu.Stanford.Nlp.Process.WordToTaggedWordProcessor <Word, IHasWord, Word>();
                IDocument <IHasWord, Word, IHasWord> sentd = proc.ProcessDocument(d);
                // Prefix each word with its zero-based position in the processed document.
                int i = 0;
                foreach (IHasWord w in sentd)
                {
                    System.Console.Out.WriteLine(i + ": " + w);
                    i++;
                }
            }
            catch (Exception e)
            {
                // Any failure while loading or processing is reported as a stack trace.
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
        /// <summary>Loads a text or html file from a file path or URL into the text pane.</summary>
        /// <remarks>
        /// Treats anything that resolves to an <tt>http://</tt> URL, or whose name ends in
        /// <tt>.htm</tt> or <tt>.html</tt>, as an html file and strips all tags from the
        /// document. Everything else is read from the local file with the parser panel's
        /// tokenizer factory. The resulting words are joined with single spaces and
        /// displayed. On failure an error dialog is shown, the status is set to an error
        /// message, and the method returns without changing the text pane.
        /// </remarks>
        /// <param name="filename">File path or URL to load; a <c>null</c> value is ignored.</param>
        public virtual void LoadFile(string filename)
        {
            if (filename == null)
            {
                return;
            }
            File   file      = new File(filename);
            string urlOrFile = filename;

            // "://" is a protocol token, so search for it ordinally; a culture-sensitive
            // search is not guaranteed to treat it as a plain character sequence.
            bool existsLocally = file.Exists();
            bool hasScheme     = filename.IndexOf("://", System.StringComparison.Ordinal) != -1;

            if (!hasScheme)
            {
                // if file can't be found locally, try prepending http:// and looking on web;
                // else prepend file:// to handle local html file urls
                urlOrFile = (existsLocally ? "file://" : "http://") + filename;
            }
            // TODO: why do any of this instead of just reading the file?  THIS SHOULD BE UPDATED FOR 2017!
            // Also, is this working correctly still?
            // load the document
            IDocument <object, Word, Word> doc;

            try
            {
                bool looksLikeHtml = urlOrFile.StartsWith("http://", System.StringComparison.Ordinal) ||
                                     urlOrFile.EndsWith(".htm", System.StringComparison.Ordinal) ||
                                     urlOrFile.EndsWith(".html", System.StringComparison.Ordinal);
                if (looksLikeHtml)
                {
                    // strip tags from html documents
                    IDocument <object, Word, Word> docPre = new BasicDocument <object>().Init(new URL(urlOrFile));
                    IDocumentProcessor <Word, Word, object, Word> noTags = new StripTagsProcessor <object, Word>();
                    doc = noTags.ProcessDocument(docPre);
                }
                else
                {
                    // NOTE(review): the stream opened here is never closed explicitly in this method;
                    // confirm whether Init consumes and releases it.
                    doc = new BasicDocument <object>(Edu.Stanford.Nlp.Parser.UI.ParserPanel.GetTokenizerFactory()).Init(new InputStreamReader(new FileInputStream(filename), encoding));
                }
            }
            catch (Exception e)
            {
                JOptionPane.ShowMessageDialog(this, "Could not load file " + filename + "\n" + e, null, JOptionPane.ErrorMessage);
                Sharpen.Runtime.PrintStackTrace(e);
                SetStatus("Error loading document");
                return;
            }
            // load the document into the text pane, separating words with single spaces
            StringBuilder docStr = new StringBuilder();

            foreach (Word aDoc in doc)
            {
                if (docStr.Length > 0)
                {
                    docStr.Append(' ');
                }
                docStr.Append(aDoc.ToString());
            }
            textPane.SetText(docStr.ToString());
            dataFileLabel.SetText(urlOrFile);
            HighlightSentence(0);
            forwardButton.SetEnabled(endIndex != textPane.GetText().Length - 1);
            // scroll to top of document
            textPane.SetCaretPosition(0);
            SetStatus("Done");
        }
Exemple #5
0
 /// <summary>Creates a new <c>Document</c> from the given <c>BasicDocument</c>.</summary>
 /// <param name="basicDocument">The basic document handed to the <c>Document</c> constructor.</param>
 /// <returns>The newly constructed <c>Document</c>.</returns>
 public static Document FromBasic(BasicDocument basicDocument)
 {
     Document wrapped = new Document(basicDocument);
     return wrapped;
 }
Exemple #6
0
 /// <summary>
 /// Creates a <c>Document</c> from an existing <c>BasicDocument</c> by forwarding it
 /// to the base-class constructor. Private, so instances cannot be created this way
 /// from outside the class.
 /// </summary>
 /// <param name="document">The basic document passed through to the base constructor.</param>
 private Document(BasicDocument document) : base(document)
 {
 }
 /// <summary>
 /// Asserts that <paramref name="result"/> contains exactly one document and that its
 /// <c>Id</c> equals the <c>Id</c> of <paramref name="expected"/>.
 /// </summary>
 /// <param name="result">The list of documents under test.</param>
 /// <param name="expected">The document whose <c>Id</c> the single result must carry.</param>
 public static void ShouldHaveOneResultMatching(this List <BasicDocument> result, BasicDocument expected)
 {
     // The count is checked first, before the first element is inspected.
     result.Should().HaveCount(1);

     BasicDocument onlyDocument = result.First();
     onlyDocument.Id.Should().Be(expected.Id);
 }