/// <summary>This will do the escaping on an input file.</summary>
/// <remarks>
/// The input file should already be tokenized, with tokens separated by whitespace.
/// An argument that starts with <c>http://</c> is loaded as a URL and run through a
/// <c>StripTagsProcessor</c> before escaping; any other argument is opened as a local
/// file. Every element of the processed document is written to standard output with
/// <c>WriteLine</c>. Any exception raised while loading or processing is caught and
/// handed to <c>Sharpen.Runtime.PrintStackTrace</c>.
/// <br />
/// Usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl
/// </remarks>
/// <param name="args">Command line argument: a file or URL</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Out.WriteLine("usage: java edu.stanford.nlp.process.PTBEscapingProcessor fileOrUrl");
        return;
    }

    string filename = args[0];
    try
    {
        IDocument<string, Word, Word> d; // initialized below

        // The scheme prefix is a machine token, so match it ordinally instead of
        // relying on the culture-sensitive default of StartsWith(string).
        if (filename.StartsWith("http://", System.StringComparison.Ordinal))
        {
            IDocument<string, Word, Word> dpre = new BasicDocument<string>(WhitespaceTokenizer.Factory()).Init(new URL(filename));
            IDocumentProcessor<Word, Word, string, Word> notags = new StripTagsProcessor<string, Word>();
            d = notags.ProcessDocument(dpre);
        }
        else
        {
            d = new BasicDocument<string>(WhitespaceTokenizer.Factory()).Init(new File(filename));
        }

        IDocumentProcessor<Word, IHasWord, string, Word> proc = new Edu.Stanford.Nlp.Process.PTBEscapingProcessor<Word, string, Word>();
        IDocument<string, Word, IHasWord> newD = proc.ProcessDocument(d);
        foreach (IHasWord word in newD)
        {
            System.Console.Out.WriteLine(word);
        }
    }
    catch (Exception e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>For internal debugging purposes only.</summary>
/// <remarks>
/// Builds a document from a fixed HTML snippet, runs it through a
/// <c>StripTagsProcessor</c> and then a <c>WordToSentenceProcessor</c>, and prints
/// the document to standard output after each stage.
/// </remarks>
/// <param name="args">Command line arguments; not read by this method.</param>
public static void Main(string[] args)
{
    new BasicDocument<string>();

    string sampleHtml = "top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.";
    IDocument<string, Word, Word> htmlDoc = BasicDocument.Init(sampleHtml);
    System.Console.WriteLine("Before:");
    System.Console.WriteLine(htmlDoc);

    // Stage 1: strip the markup.
    Edu.Stanford.Nlp.Process.StripTagsProcessor<string, Word> tagStripper =
        new Edu.Stanford.Nlp.Process.StripTagsProcessor<string, Word>(true);
    IDocument<string, Word, Word> txtDoc = tagStripper.ProcessDocument(htmlDoc);
    System.Console.WriteLine("After:");
    System.Console.WriteLine(txtDoc);

    // Stage 2: group the remaining words into lists.
    WordToSentenceProcessor<Word> sentenceSplitter = new WordToSentenceProcessor<Word>();
    IDocument<string, Word, IList<Word>> sentences = sentenceSplitter.ProcessDocument(txtDoc);
    System.Console.WriteLine("Sentences:");
    System.Console.WriteLine(sentences);
}
/// <summary>This will print out some text, recognizing tags.</summary>
/// <remarks>
/// It can be used to test tag breaking. An argument that starts with
/// <c>http://</c> is loaded as a URL and run through a <c>StripTagsProcessor</c>
/// first; any other argument is opened as a local file. Each element of the
/// processed document is printed as <c>index: element</c>, with the index starting
/// at 0. Any exception raised while loading or processing is caught and handed to
/// <c>Sharpen.Runtime.PrintStackTrace</c>.
/// <br />
/// Usage: <code>
/// java edu.stanford.nlp.process.WordToTaggedWordProcessor fileOrUrl
/// </code>
/// </remarks>
/// <param name="args">Command line argument: a file or URL</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Out.WriteLine("usage: java edu.stanford.nlp.process.WordToTaggedWordProcessor fileOrUrl");
        // NOTE(review): exits with status 0 even though this is a usage error;
        // left unchanged in case callers depend on it.
        System.Environment.Exit(0);
    }

    string filename = args[0];
    try
    {
        IDocument<IHasWord, Word, Word> d;

        // The scheme prefix is a machine token, so match it ordinally instead of
        // relying on the culture-sensitive default of StartsWith(string).
        if (filename.StartsWith("http://", System.StringComparison.Ordinal))
        {
            IDocument<IHasWord, Word, Word> dpre = new BasicDocument<IHasWord>().Init(new URL(filename));
            IDocumentProcessor<Word, Word, IHasWord, Word> notags = new StripTagsProcessor<IHasWord, Word>();
            d = notags.ProcessDocument(dpre);
        }
        else
        {
            d = new BasicDocument<IHasWord>().Init(new File(filename));
        }

        IDocumentProcessor<Word, IHasWord, IHasWord, Word> proc = new Edu.Stanford.Nlp.Process.WordToTaggedWordProcessor<Word, IHasWord, Word>();
        IDocument<IHasWord, Word, IHasWord> sentd = proc.ProcessDocument(d);

        int i = 0;
        foreach (IHasWord w in sentd)
        {
            System.Console.Out.WriteLine(i + ": " + w);
            i++;
        }
    }
    catch (Exception e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>Loads a text or html file from a file path or URL.</summary>
/// <remarks>
/// Treats anything beginning with <tt>http://</tt>, or ending with <tt>.htm</tt>
/// or <tt>.html</tt>, as an html file, and strips all tags from the document.
/// A name without a <tt>://</tt> scheme is prefixed with <tt>file://</tt> when the
/// file exists locally and with <tt>http://</tt> otherwise. Does nothing when
/// <paramref name="filename"/> is null. If loading fails, an error dialog is shown,
/// the status is set to "Error loading document", and the method returns without
/// touching the text pane.
/// </remarks>
/// <param name="filename">Local file path or URL of the document to load; may be null.</param>
public virtual void LoadFile(string filename)
{
    if (filename == null)
    {
        return;
    }

    File file = new File(filename);
    bool existsLocally = file.Exists();
    // "://" is a machine token: search for it ordinally rather than with the
    // culture-sensitive default of IndexOf(string).
    bool hasScheme = filename.IndexOf("://", System.StringComparison.Ordinal) != -1;

    string urlOrFile = filename;
    if (!hasScheme)
    {
        if (!existsLocally)
        {
            // if file can't be found locally, try prepending http:// and looking on web
            urlOrFile = "http://" + filename;
        }
        else
        {
            // else prepend file:// to handle local html file urls
            urlOrFile = "file://" + filename;
        }
    }

    // TODO: why do any of this instead of just reading the file? THIS SHOULD BE UPDATED FOR 2017!
    // Also, is this working correctly still?

    // load the document
    IDocument<object, Word, Word> doc;
    try
    {
        // Ordinal prefix/suffix checks: scheme and extension are not linguistic text.
        bool looksLikeHtml = urlOrFile.StartsWith("http://", System.StringComparison.Ordinal)
            || urlOrFile.EndsWith(".htm", System.StringComparison.Ordinal)
            || urlOrFile.EndsWith(".html", System.StringComparison.Ordinal);

        if (looksLikeHtml)
        {
            // strip tags from html documents
            IDocument<object, Word, Word> docPre = new BasicDocument<object>().Init(new URL(urlOrFile));
            IDocumentProcessor<Word, Word, object, Word> noTags = new StripTagsProcessor<object, Word>();
            doc = noTags.ProcessDocument(docPre);
        }
        else
        {
            // NOTE(review): the reader/stream opened here is never explicitly closed
            // in this method — confirm whether Init() takes ownership of it.
            doc = new BasicDocument<object>(Edu.Stanford.Nlp.Parser.UI.ParserPanel.GetTokenizerFactory()).Init(new InputStreamReader(new FileInputStream(filename), encoding));
        }
    }
    catch (Exception e)
    {
        JOptionPane.ShowMessageDialog(this, "Could not load file " + filename + "\n" + e, null, JOptionPane.ErrorMessage);
        Sharpen.Runtime.PrintStackTrace(e);
        SetStatus("Error loading document");
        return;
    }

    // load the document into the text pane, words separated by single spaces
    StringBuilder docStr = new StringBuilder();
    foreach (Word aDoc in doc)
    {
        if (docStr.Length > 0)
        {
            docStr.Append(' ');
        }
        docStr.Append(aDoc.ToString());
    }
    textPane.SetText(docStr.ToString());
    dataFileLabel.SetText(urlOrFile);

    HighlightSentence(0);
    forwardButton.SetEnabled(endIndex != textPane.GetText().Length - 1);

    // scroll to top of document
    textPane.SetCaretPosition(0);
    SetStatus("Done");
}
/// <summary>
/// Creates a <c>Document</c> from the given <c>BasicDocument</c> by passing it
/// to the <c>Document</c> constructor.
/// </summary>
/// <param name="basicDocument">The basic document to construct the new instance from.</param>
/// <returns>The newly constructed <c>Document</c>.</returns>
public static Document FromBasic(BasicDocument basicDocument)
{
    Document created = new Document(basicDocument);
    return created;
}
/// <summary>
/// Initializes a new instance by forwarding <paramref name="document"/> to the
/// base-class constructor; no further work is done here.
/// </summary>
/// <param name="document">The basic document handed to the base constructor.</param>
private Document(BasicDocument document)
    : base(document)
{
}
/// <summary>
/// Asserts that <paramref name="result"/> holds exactly one document and that
/// its <c>Id</c> equals the <c>Id</c> of <paramref name="expected"/>.
/// </summary>
/// <param name="result">The list of documents under test.</param>
/// <param name="expected">The document whose <c>Id</c> the single result must carry.</param>
public static void ShouldHaveOneResultMatching(this List<BasicDocument> result, BasicDocument expected)
{
    // Count check first, then compare ids of the first element.
    result.Should().HaveCount(1);

    BasicDocument onlyResult = result.First();
    onlyResult.Id.Should().Be(expected.Id);
}