/// <summary>
/// Creates the lookup and registers every supplied field configuration.
/// </summary>
/// <param name="fieldTokenizationOptions">The field configurations to register, in enumeration order.</param>
/// <param name="tokenizerFactory">Factory handed to <c>RegisterField</c> for each field.</param>
/// <param name="defaultTokenizationOptions">Default options handed to <c>RegisterField</c> for each field.</param>
/// <exception cref="ArgumentNullException">Thrown when any argument is null.</exception>
internal IndexedFieldLookup(
    IEnumerable<IFieldTokenizationOptions> fieldTokenizationOptions,
    ITokenizerFactory tokenizerFactory,
    TokenizationOptions defaultTokenizationOptions)
{
    // Arguments are validated in declaration order so the first offending one is reported.
    var fields = fieldTokenizationOptions ?? throw new ArgumentNullException(nameof(fieldTokenizationOptions));
    var factory = tokenizerFactory ?? throw new ArgumentNullException(nameof(tokenizerFactory));
    var defaults = defaultTokenizationOptions ?? throw new ArgumentNullException(nameof(defaultTokenizationOptions));

    foreach (var fieldOptions in fields)
    {
        this.RegisterField(fieldOptions, factory, defaults);
    }
}
/// <summary>
/// Saves the results of applying the parser to the text currently in the
/// text pane to the given file, showing a progress dialog while a
/// <c>SaveOutputThread</c> does the work.
/// </summary>
/// <param name="filename">Destination file name; a null or empty value makes the call a no-op.</param>
public virtual void SaveOutput(string filename)
{
    if (string.IsNullOrEmpty(filename))
    {
        return;
    }

    // Split the pane's text into sentences with the language pack's tokenizer.
    string paneText = textPane.GetText();
    StringReader textReader = new StringReader(paneText);
    DocumentPreprocessor preprocessor = new DocumentPreprocessor(textReader);
    ITokenizerFactory<IHasWord> tokenizerFactory = tlp.GetTokenizerFactory();
    preprocessor.SetTokenizerFactory(tokenizerFactory);

    IList<IList<IHasWord>> sentences = new List<IList<IHasWord>>();
    foreach (IList<IHasWord> tokens in preprocessor)
    {
        sentences.Add(tokens);
    }

    // Build the progress dialog: label on top, bar in the middle, cancel button at the bottom.
    JProgressBar progressBar = new JProgressBar(0, sentences.Count);
    JButton cancelButton = new JButton();
    JDialog progressDialog = new JDialog(new Frame(), "Parser Progress", true);
    progressDialog.SetSize(300, 150);
    progressDialog.Add(BorderLayout.North, new JLabel("Parsing " + sentences.Count + " sentences"));
    progressDialog.Add(BorderLayout.Center, progressBar);
    progressDialog.Add(BorderLayout.South, cancelButton);

    ParserPanel.SaveOutputThread worker =
        new ParserPanel.SaveOutputThread(this, filename, progressBar, progressDialog, cancelButton, sentences);

    cancelButton.SetText("Cancel");
    cancelButton.SetToolTipText("Cancel");
    // NOTE(review): a null listener is registered here, exactly as before; the
    // cancel handler presumably has to be wired up elsewhere — confirm.
    cancelButton.AddActionListener(null);

    worker.Start();
    progressDialog.SetVisible(true);
}
/// <summary>
/// Builds a FrenchTokenizer factory configured with <c>FtbOptions</c>, which
/// replicates the tokenization of Green, de Marneffe, and Manning (2011).
/// </summary>
/// <returns>A new tokenizer factory with the FTB options applied.</returns>
public static ITokenizerFactory<CoreLabel> FtbFactory()
{
    ITokenizerFactory<CoreLabel> factory = FrenchTokenizer.FrenchTokenizerFactory.NewTokenizerFactory();
    factory.SetOptions(FtbOptions);
    return factory;
}
/// <summary>A fast, rule-based tokenizer for Modern Standard French.</summary>
/// <remarks>
/// Performs punctuation splitting and light tokenization by default.
/// <para>
/// Currently, this tokenizer does not do line splitting. It assumes that the input
/// is delimited by the system line separator. The output will be equivalently
/// delimited.
/// </para>
/// </remarks>
/// <param name="args">Command-line arguments, parsed against <c>ArgOptionDefs()</c>.</param>
public static void Main(string[] args)
{
    Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());
    if (options.Contains("help"))
    {
        log.Info(Usage());
        return;
    }

    // Lexer options: the "ftb" flag selects the FTB-compatible factory.
    ITokenizerFactory<CoreLabel> tf;
    if (options.Contains("ftb"))
    {
        tf = FrenchTokenizer.FtbFactory();
    }
    else
    {
        tf = FrenchTokenizer.Factory();
    }

    // When called from this main method, always split on newline. There are no
    // options for more granular sentence splitting.
    string orthoOptions = options.GetProperty("options", string.Empty);
    if (orthoOptions.IsEmpty())
    {
        orthoOptions = "tokenizeNLs";
    }
    else
    {
        orthoOptions = orthoOptions + ",tokenizeNLs";
    }
    tf.SetOptions(orthoOptions);

    // Other options
    string encoding = options.GetProperty("encoding", "UTF-8");
    bool toLower = PropertiesUtils.GetBool(options, "lowerCase", false);

    // Tokenize the input stream, echoing tokens separated by single spaces.
    int nLines = 0;
    int nTokens = 0;
    long startTime = Runtime.NanoTime();
    try
    {
        ITokenizer<CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
        bool printSpace = false;
        while (tokenizer.MoveNext())
        {
            ++nTokens;
            string word = tokenizer.Current.Word();
            if (word.Equals(FrenchLexer.NewlineToken))
            {
                // End of an input line: finish the output line and reset spacing.
                ++nLines;
                printSpace = false;
                System.Console.Out.WriteLine();
                continue;
            }

            if (printSpace)
            {
                System.Console.Out.Write(" ");
            }
            System.Console.Out.Write(toLower ? word.ToLower(Locale.French) : word);
            printSpace = true;
        }
    }
    catch (UnsupportedEncodingException e)
    {
        log.Error(e);
    }

    // Report throughput on stderr.
    long elapsedTime = Runtime.NanoTime() - startTime;
    double elapsedSeconds = elapsedTime / 1e9;
    double linesPerSec = (double)nLines / elapsedSeconds;
    System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
}
/// <summary>Creates the tokenizer factory used by this segmenter.</summary>
/// <remarks>
/// When no tokenizer options were configured, the factory is
/// <c>ArabicTokenizer.AtbFactory()</c> (the orthographic normalization of
/// Green and Manning (2010)) with the pro-marker, morph-marker and
/// lengthening removal options added. Otherwise a plain
/// <c>ArabicTokenizer.Factory()</c> is configured with the user's options.
/// </remarks>
/// <returns>
/// A TokenizerFactory that produces each Arabic token as a CoreLabel, or
/// <c>null</c> when the input is already tokenized.
/// </returns>
/// <exception cref="System.Exception">
/// The configured options contain <c>removeSegMarker</c>.
/// </exception>
private ITokenizerFactory<CoreLabel> GetTokenizerFactory()
{
    if (isTokenized)
    {
        // Pre-tokenized input needs no tokenizer.
        return null;
    }

    ITokenizerFactory<CoreLabel> tokFactory;
    string appliedOptions;
    if (tokenizerOptions == null)
    {
        tokFactory = ArabicTokenizer.AtbFactory();
        appliedOptions = "removeProMarker,removeMorphMarker,removeLengthening";
    }
    else
    {
        if (tokenizerOptions.Contains("removeSegMarker"))
        {
            throw new Exception("Option 'removeSegMarker' cannot be used with ArabicSegmenter");
        }
        tokFactory = ArabicTokenizer.Factory();
        appliedOptions = tokenizerOptions;
    }
    tokFactory.SetOptions(appliedOptions);

    // Log the options actually passed to SetOptions. Previously this printed
    // tokenizerOptions, which is null (an empty string in the message) on the
    // default path.
    log.Info("Loaded ArabicTokenizer with options: " + appliedOptions);
    return tokFactory;
}
/// <summary>
/// Tags every sentence of a UTF-8 text file with a MaxentTagger model and
/// prints the tagged sentences, then prints the adjectives of one extra
/// hard-coded sentence.
/// </summary>
/// <param name="args">Exactly two values: the model file and the file to tag.</param>
/// <exception cref="System.Exception"/>
public static void Main(string[] args)
{
    if (args.Length != 2)
    {
        log.Info("usage: java TaggerDemo2 modelFile fileToTag");
        return;
    }

    MaxentTagger tagger = new MaxentTagger(args[0]);
    ITokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
    // NOTE(review): r is not closed here, as in the original; confirm whether
    // DocumentPreprocessor takes ownership of the reader.
    BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, "utf-8"));
    try
    {
        DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
        documentPreprocessor.SetTokenizerFactory(ptbTokenizerFactory);
        foreach (IList<IHasWord> sentence in documentPreprocessor)
        {
            IList<TaggedWord> tSentence = tagger.TagSentence(sentence);
            pw.Println(SentenceUtils.ListToString(tSentence, false));
        }

        // Print the adjectives in one more sentence. This shows how to get at
        // words and tags in a tagged sentence.
        IList<IHasWord> sent = SentenceUtils.ToWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
        IList<TaggedWord> taggedSent = tagger.TagSentence(sent);
        foreach (TaggedWord tw in taggedSent)
        {
            if (tw.Tag().StartsWith("JJ"))
            {
                pw.Println(tw.Word());
            }
        }
    }
    finally
    {
        // Close even when tagging throws, so output already handed to the
        // writer is not left unclosed.
        pw.Close();
    }
}
// end static class SpanishTokenizerFactory
/// <summary>
/// Builds a CoreLabel tokenizer factory configured with <c>AncoraOptions</c>
/// (Ancora tokenization).
/// </summary>
/// <returns>A new tokenizer factory with the Ancora options applied.</returns>
public static ITokenizerFactory<CoreLabel> AncoraFactory()
{
    ITokenizerFactory<CoreLabel> factory = SpanishTokenizer.SpanishTokenizerFactory.NewCoreLabelTokenizerFactory();
    factory.SetOptions(AncoraOptions);
    return factory;
}
/// <summary>
/// Demonstrates calling the parser on already tokenized text and on raw
/// text that is tokenized as a single sentence.
/// </summary>
/// <remarks>
/// Output is handled with a TreePrint object. Note that the options used
/// when creating the TreePrint can determine what results to print out.
/// One can capture the output by passing a PrintWriter to
/// TreePrint.printTree. This code is for English.
/// </remarks>
/// <param name="lp">The parser to run the demonstration with.</param>
public static void DemoAPI(LexicalizedParser lp)
{
    // Variant 1: parse a list of correctly tokenized words.
    string[] words = { "This", "is", "an", "easy", "sentence", "." };
    IList<CoreLabel> wordLabels = SentenceUtils.ToCoreLabelList(words);
    Tree parse = lp.Apply(wordLabels);
    parse.PennPrint();
    System.Console.Out.WriteLine();

    // Variant 2: load and use an explicit tokenizer on raw text.
    string rawSentence = "This is another sentence.";
    ITokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
    ITokenizer<CoreLabel> tokenizer = tokenizerFactory.GetTokenizer(new StringReader(rawSentence));
    IList<CoreLabel> tokenLabels = tokenizer.Tokenize();
    parse = lp.Apply(tokenLabels);

    // Derive typed dependencies from the parse (PennTreebankLanguagePack for English).
    ITreebankLanguagePack languagePack = lp.TreebankLanguagePack();
    IGrammaticalStructureFactory structureFactory = languagePack.GrammaticalStructureFactory();
    GrammaticalStructure structure = structureFactory.NewGrammaticalStructure(parse);
    IList<TypedDependency> dependencies = structure.TypedDependenciesCCprocessed();
    System.Console.Out.WriteLine(dependencies);
    System.Console.Out.WriteLine();

    // A TreePrint object can also print trees and dependencies.
    TreePrint printer = new TreePrint("penn,typedDependenciesCollapsed");
    printer.PrintTree(parse);
}
/// <summary>Tokenizes the text with the tokenizer of the parser's language pack.</summary>
/// <param name="sentence">The raw text to tokenize.</param>
/// <returns>The tokens produced by the tokenizer.</returns>
public virtual IList<IHasWord> Tokenize(string sentence)
{
    ITokenizerFactory<IHasWord> factory = TreebankLanguagePack().GetTokenizerFactory();
    return factory.GetTokenizer(new StringReader(sentence)).Tokenize();
}
/// <summary>
/// Creates a renderer from its three collaborators and starts with an
/// empty list of modificators.
/// </summary>
/// <param name="parser">Stored in <c>Parser</c>.</param>
/// <param name="tokenizer">Stored in <c>Tokenizer</c>.</param>
/// <param name="nodeRenderer">Stored in <c>NodeRenderer</c>.</param>
public MarkdownToHtmlRenderer(
    MarkdownParser parser,
    ITokenizerFactory<IMdToken> tokenizer,
    INodeRenderer nodeRenderer)
{
    this.Parser = parser;
    this.Tokenizer = tokenizer;
    this.NodeRenderer = nodeRenderer;
    this.Modificators = new List<INodeVisitor>();
}
/// <summary>
/// Constructs a new DocumentReader that will read text from the given
/// Reader and tokenize it into words using the given tokenizer factory.
/// </summary>
/// <remarks>
/// The default implementation will internally buffer the reader if it is not
/// already buffered, so there is no need to pre-wrap the reader with a
/// BufferedReader. This class provides many <c>getReader</c> methods for
/// conveniently reading from many input sources.
/// </remarks>
/// <param name="in">Source of the text; when null, no reader is set.</param>
/// <param name="tokenizerFactory">Passed to <c>SetTokenizerFactory</c>.</param>
/// <param name="keepOriginalText">Stored in the <c>keepOriginalText</c> field.</param>
public DocumentReader(Reader @in, ITokenizerFactory<IHasWord> tokenizerFactory, bool keepOriginalText)
{
    bool hasReader = @in != null;
    if (hasReader)
    {
        this.SetReader(@in);
    }

    this.SetTokenizerFactory(tokenizerFactory);
    this.keepOriginalText = keepOriginalText;
}
/// <summary>A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding).</summary>
/// <remarks>
/// Performs punctuation splitting and light tokenization by default.
/// Orthographic normalization options are available, and can be enabled with
/// command line options.
/// <para>
/// Currently, this tokenizer does not do line splitting. It normalizes non-printing
/// line separators across platforms and prints the system default line splitter
/// to the output.
/// </para>
/// <para>The following normalization options are provided:</para>
/// <list type="bullet">
/// <item><description><c>useUTF8Ellipsis</c>: Replaces sequences of three or more full stops with \u2026</description></item>
/// <item><description><c>normArDigits</c>: Convert Arabic digits to ASCII equivalents</description></item>
/// <item><description><c>normArPunc</c>: Convert Arabic punctuation to ASCII equivalents</description></item>
/// <item><description><c>normAlif</c>: Change all alif forms to bare alif</description></item>
/// <item><description><c>normYa</c>: Map ya to alif maqsura</description></item>
/// <item><description><c>removeDiacritics</c>: Strip all diacritics</description></item>
/// <item><description><c>removeTatweel</c>: Strip tatweel elongation character</description></item>
/// <item><description><c>removeQuranChars</c>: Remove diacritics that appear in the Quran</description></item>
/// <item><description><c>removeProMarker</c>: Remove the ATB null pronoun marker</description></item>
/// <item><description><c>removeSegMarker</c>: Remove the ATB clitic segmentation marker</description></item>
/// <item><description><c>removeMorphMarker</c>: Remove the ATB morpheme boundary markers</description></item>
/// <item><description><c>removeLengthening</c>: Replace all sequences of three or more identical (non-period) characters with one copy</description></item>
/// <item><description><c>atbEscaping</c>: Replace left/right parentheses with ATB escape characters</description></item>
/// </list>
/// </remarks>
/// <param name="args">Command-line arguments; each parsed property name is applied as a tokenizer option.</param>
public static void Main(string[] args)
{
    bool helpRequested = args.Length > 0 && args[0].Contains("help");
    if (helpRequested)
    {
        System.Console.Error.Printf("Usage: java %s [OPTIONS] < file%n", typeof(ArabicTokenizer).FullName);
        System.Console.Error.Printf("%nOptions:%n");
        log.Info(" -help : Print this message. See javadocs for all normalization options.");
        log.Info(" -atb : Tokenization for the parsing experiments in Green and Manning (2010)");
        System.Environment.Exit(-1);
    }

    // Process normalization options: "atb" selects the ATB factory, and every
    // property name is forwarded to the factory as an option.
    Properties tokenizerOptions = StringUtils.ArgsToProperties(args);
    ITokenizerFactory<CoreLabel> tf;
    if (tokenizerOptions.Contains("atb"))
    {
        tf = ArabicTokenizer.AtbFactory();
    }
    else
    {
        tf = ArabicTokenizer.Factory();
    }
    foreach (string optionName in tokenizerOptions.StringPropertyNames())
    {
        tf.SetOptions(optionName);
    }

    // Replace line separators with a token so that lines can be counted.
    tf.SetOptions("tokenizeNLs");

    // Tokenize the input stream, echoing tokens separated by single spaces.
    int nLines = 0;
    int nTokens = 0;
    try
    {
        string encoding = "UTF-8";
        ITokenizer<CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
        bool printSpace = false;
        while (tokenizer.MoveNext())
        {
            ++nTokens;
            string word = tokenizer.Current.Word();
            if (word.Equals(ArabicLexer.NewlineToken))
            {
                // End of an input line: finish the output line and reset spacing.
                ++nLines;
                printSpace = false;
                System.Console.Out.WriteLine();
                continue;
            }

            if (printSpace)
            {
                System.Console.Out.Write(" ");
            }
            System.Console.Out.Write(word);
            printSpace = true;
        }
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }

    System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens)%n", nLines, nTokens);
}
/// <summary>
/// Builds an ArabicTokenizer factory and applies every property name in
/// <c>atbOptions</c> to it as an option.
/// </summary>
/// <returns>A new tokenizer factory with the ATB options applied.</returns>
public static ITokenizerFactory<CoreLabel> AtbFactory()
{
    ITokenizerFactory<CoreLabel> factory = ArabicTokenizer.ArabicTokenizerFactory.NewTokenizerFactory();
    foreach (string optionName in atbOptions.StringPropertyNames())
    {
        factory.SetOptions(optionName);
    }
    return factory;
}
/// <summary>
/// Creates an extractor that reads the whole file named by the
/// <c>Constants.MucProp</c> property into memory and prepares a PTB
/// tokenizer factory and the Stanford processor.
/// </summary>
/// <param name="dict">Forwarded to the base constructor.</param>
/// <param name="props">Supplies the input file name and the processor configuration.</param>
/// <param name="semantics">Forwarded to the base constructor.</param>
/// <exception cref="System.Exception"/>
public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics)
    : base(dict, semantics)
{
    // Reading starts at the beginning of the slurped contents.
    currentOffset = 0;
    fileContents = IOUtils.SlurpFile(props.GetProperty(Constants.MucProp));

    tokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(false), string.Empty);
    stanfordProcessor = LoadStanfordProcessor(props);
}
/// <summary>Creates a reader/writer with every option spelled out.</summary>
/// <param name="hasSegMarkers">if true, input has segmentation markers (the default marker is used)</param>
/// <param name="hasTags">if true, input has morphological analyses separated by tagDelimiter.</param>
/// <param name="hasDomainLabels">
/// if true, input has a whitespace-terminated domain at the beginning
/// of each line of text
/// </param>
/// <param name="domain">stored as the input domain (presumably used when lines carry no label — TODO confirm)</param>
/// <param name="stripRewrites">
/// if true, erase orthographical rewrites from the gold labels (for
/// comparison purposes)
/// </param>
/// <param name="tokFactory">a TokenizerFactory for the input</param>
public ArabicDocumentReaderAndWriter(
    bool hasSegMarkers,
    bool hasTags,
    bool hasDomainLabels,
    string domain,
    bool stripRewrites,
    ITokenizerFactory<CoreLabel> tokFactory)
{
    // Input-format switches.
    if (hasSegMarkers)
    {
        segMarker = DefaultSegMarker;
    }
    else
    {
        segMarker = null;
    }
    inputHasTags = hasTags;
    inputHasDomainLabels = hasDomainLabels;
    inputDomain = domain;
    shouldStripRewrites = stripRewrites;
    tf = tokFactory;

    // Built last because the line-iterator function receives this instance.
    factory = LineIterator.GetFactory(new _ISerializableFunction_131(this));
}
/// <summary>
/// Replaces the default <see cref="ITokenizerFactory"/> implementation.
/// </summary>
/// <param name="tokenizerFactory">The factory the builder should use from now on.</param>
/// <returns>This builder, so calls can be chained.</returns>
/// <exception cref="ArgumentNullException"><paramref name="tokenizerFactory"/> is null.</exception>
public FullTextIndexBuilder<TKey> WithTokenizerFactory(ITokenizerFactory tokenizerFactory)
{
    this.tokenizerFactory = tokenizerFactory ?? throw new ArgumentNullException(nameof(tokenizerFactory));
    return this;
}
/// <summary>Make an Arabic Segmenter.</summary>
/// <remarks>
/// Segmenter-specific options are read from <paramref name="props"/> and then
/// removed from it; the remaining properties configure the sequence
/// classifier flags.
/// </remarks>
/// <param name="props">
/// Options for how to tokenize. See the main method of
/// <see cref="ArabicTokenizer{T}"/>
/// for details
/// </param>
/// <exception cref="System.Exception">
/// Both the local-features-only flag and a custom feature factory are present.
/// </exception>
public ArabicSegmenter(Properties props)
{
    // Tokenization settings. These two must be assigned before
    // GetTokenizerFactory() is called below, because it reads them.
    isTokenized = props.Contains(optTokenized);
    tokenizerOptions = props.GetProperty(optTokenizer, null);

    // Evaluation / domain settings.
    tedEvalPrefix = props.GetProperty(optTedEval, null);
    hasDomainLabels = props.Contains(optWithDomains);
    domain = props.GetProperty(optDomain, "atb");
    noRewrites = props.Contains(optNoRewrites);

    tf = GetTokenizerFactory();

    // Markers attached to segmented prefixes and suffixes.
    prefixMarker = props.GetProperty(optPrefix, string.Empty);
    suffixMarker = props.GetProperty(optSuffix, string.Empty);

    // Choose the feature factory: local-only, custom, or the default.
    if (props.Contains(optLocalFeaturesOnly))
    {
        if (props.Contains(optFeatureFactory))
        {
            throw new Exception("Cannot use custom feature factory with localFeaturesOnly flag--" + "have your custom feature factory extend ArabicSegmenterFeatureFactory instead of " + "StartAndEndArabicSegmenterFeatureFactory and remove the localFeaturesOnly flag.");
        }
        props.SetProperty(optFeatureFactory, localOnlyFeatureFactory);
    }
    if (!props.Contains(optFeatureFactory))
    {
        props.SetProperty(optFeatureFactory, defaultFeatureFactory);
    }

    // Remove all command-line properties that are specific to ArabicSegmenter
    // before handing the rest to the classifier flags.
    string[] segmenterOnlyKeys = new string[]
    {
        optTokenizer,
        optTokenized,
        optPrefix,
        optSuffix,
        optThreads,
        optTedEval,
        optWithDomains,
        optDomain,
        optNoRewrites,
        optLocalFeaturesOnly
    };
    foreach (string key in segmenterOnlyKeys)
    {
        props.Remove(key);
    }

    flags = new SeqClassifierFlags(props);
    classifier = new CRFClassifier<CoreLabel>(flags);
}
/// <summary>
/// Tokenizes each test string with the factory and asserts that the token
/// count and every token's word match the expected results.
/// </summary>
/// <param name="factory">Factory producing the tokenizer under test.</param>
/// <param name="testStrings">The inputs to tokenize.</param>
/// <param name="resultsStrings">For each input, the expected words in order.</param>
public virtual void RunTest<_T0>(ITokenizerFactory<_T0> factory, string[] testStrings, string[][] resultsStrings)
    where _T0 : IHasWord
{
    for (int caseIndex = 0; caseIndex < testStrings.Length; ++caseIndex)
    {
        string[] expectedWords = resultsStrings[caseIndex];
        ITokenizer<IHasWord> tokenizer = factory.GetTokenizer(new StringReader(testStrings[caseIndex]));
        IList<IHasWord> tokens = tokenizer.Tokenize();

        NUnit.Framework.Assert.AreEqual(expectedWords.Length, tokens.Count);
        for (int wordIndex = 0; wordIndex < expectedWords.Length; ++wordIndex)
        {
            NUnit.Framework.Assert.AreEqual(expectedWords[wordIndex], tokens[wordIndex].Word());
        }
    }
}
/// <summary>Copy constructor.</summary>
/// <remarks>
/// All settings and the classifier are copied by reference; only the
/// tokenizer factory is created afresh.
/// </remarks>
/// <param name="other">The segmenter to copy.</param>
public ArabicSegmenter(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter other)
{
    // Tokenization settings first: GetTokenizerFactory() below reads them.
    isTokenized = other.isTokenized;
    tokenizerOptions = other.tokenizerOptions;

    // Remaining configuration.
    domain = other.domain;
    hasDomainLabels = other.hasDomainLabels;
    noRewrites = other.noRewrites;
    tedEvalPrefix = other.tedEvalPrefix;
    prefixMarker = other.prefixMarker;
    suffixMarker = other.suffixMarker;
    flags = other.flags;

    // ArabicTokenizerFactory is *not* threadsafe, so each copy gets its own.
    tf = GetTokenizerFactory();

    // CRFClassifier is threadsafe, so the reference is shared.
    classifier = other.classifier;
}
/// <summary>
/// Creates an index, substituting default implementations for any
/// collaborator that is not supplied.
/// </summary>
/// <param name="options">Configuration passed to the node factory's <c>Configure</c>.</param>
/// <param name="indexNodeFactory">Node factory; a new <c>IndexNodeFactory</c> when null.</param>
/// <param name="tokenizerFactory">Tokenizer factory; a new <c>TokenizerFactory</c> when null.</param>
/// <param name="queryParser">Query parser; a new <c>QueryParser</c> when null.</param>
public FullTextIndex(
    FullTextIndexConfiguration<TKey> options,
    IIndexNodeFactory indexNodeFactory = null,
    ITokenizerFactory tokenizerFactory = null,
    IQueryParser queryParser = null)
{
    // Fall back to the default implementation of each optional collaborator.
    if (indexNodeFactory == null)
    {
        indexNodeFactory = new IndexNodeFactory();
    }
    this.indexNodeFactory = indexNodeFactory;

    if (tokenizerFactory == null)
    {
        tokenizerFactory = new TokenizerFactory();
    }
    this.tokenizerFactory = tokenizerFactory;

    if (queryParser == null)
    {
        queryParser = new QueryParser();
    }
    this.queryParser = queryParser;

    // The node factory is configured before the root node is created.
    this.indexNodeFactory.Configure(options);

    this.IdPool = new IdPool<TKey>();
    this.FieldLookup = new IndexedFieldLookup();
    this.Root = this.indexNodeFactory.CreateNode();
}
/// <summary>
/// Tokenizes every entry of <c>untokInputs</c> with the ATB factory (plus the
/// marker-removal options) and checks the space-joined result against the
/// matching entry of <c>tokReferences</c>.
/// </summary>
public virtual void TestArabicTokenizer()
{
    // A real assertion rather than Debug.Assert, which is compiled out of
    // Release builds and would let a fixture mismatch surface as an index error.
    NUnit.Framework.Assert.AreEqual(tokReferences.Length, untokInputs.Length, "Input and reference fixtures differ in length");

    ITokenizerFactory<CoreLabel> tf = ArabicTokenizer.AtbFactory();
    tf.SetOptions("removeProMarker");
    tf.SetOptions("removeSegMarker");
    tf.SetOptions("removeMorphMarker");

    for (int i = 0; i < untokInputs.Length; ++i)
    {
        string line = untokInputs[i];
        ITokenizer<CoreLabel> tokenizer = tf.GetTokenizer(new StringReader(line));
        IList<CoreLabel> tokens = tokenizer.Tokenize();
        string tokenizedLine = SentenceUtils.ListToString(tokens);
        string reference = tokReferences[i];

        // NUnit's order is (expected, actual, message). The JUnit-style order
        // used before compared the message text against the reference.
        NUnit.Framework.Assert.AreEqual(reference, tokenizedLine, "Tokenization deviates from reference");
    }
}
/// <summary>
/// Tokenizes one marked-up Arabic sentence and checks the number of tokens
/// and each token's begin and end character offsets against fixed
/// reference values.
/// </summary>
public virtual void TestCharOffsets()
{
    string untokInput = "إِنَّ- -نا هادِئ+ُونَ .";
    int[] beginOffsets = new int[] { 0, 7, 11, 22 };
    int[] endOffsets = new int[] { 6, 10, 21, 23 };

    ITokenizerFactory<CoreLabel> tf = ArabicTokenizer.AtbFactory();
    tf.SetOptions("removeProMarker");
    tf.SetOptions("removeSegMarker");
    tf.SetOptions("removeMorphMarker");

    ITokenizer<CoreLabel> tokenizer = tf.GetTokenizer(new StringReader(untokInput));
    IList<CoreLabel> tokens = tokenizer.Tokenize();

    // NUnit's order is (expected, actual, message); the assertions previously
    // used JUnit's (message, expected, actual) order.
    NUnit.Framework.Assert.AreEqual(beginOffsets.Length, tokens.Count, "Number of tokens doesn't match reference");
    for (int i = 0; i < beginOffsets.Length; i++)
    {
        NUnit.Framework.Assert.AreEqual(beginOffsets[i], tokens[i].BeginPosition(), "Char begin offset deviates from reference");
        NUnit.Framework.Assert.AreEqual(endOffsets[i], tokens[i].EndPosition(), "Char end offset deviates from reference");
    }
}
/// <summary>
/// Registers one field: allocates the next field id and records the field's
/// details and name in the two lookups.
/// </summary>
/// <param name="fieldOptions">Supplies the field name and, optionally, its own tokenization options.</param>
/// <param name="tokenizerFactory">Creates the tokenizer stored with the field.</param>
/// <param name="defaultTokenizationOptions">Used when the field has no tokenization options of its own.</param>
/// <exception cref="LiftiException">
/// The field name is already registered, or the allocated id exceeds <see cref="byte.MaxValue"/>.
/// </exception>
private void RegisterField(
    IFieldTokenizationOptions fieldOptions,
    ITokenizerFactory tokenizerFactory,
    TokenizationOptions defaultTokenizationOptions)
{
    var name = fieldOptions.Name;
    if (this.fieldToDetailsLookup.ContainsKey(name))
    {
        throw new LiftiException(ExceptionMessages.FieldNameAlreadyUsed, name);
    }

    // Ids are stored as bytes, so the counter must stay within byte range.
    var allocatedId = Interlocked.Increment(ref nextId);
    if (allocatedId > byte.MaxValue)
    {
        throw new LiftiException(ExceptionMessages.MaximumDistinctFieldsIndexReached);
    }

    byte fieldId = (byte)allocatedId;
    var effectiveOptions = fieldOptions.TokenizationOptions ?? defaultTokenizationOptions;
    var tokenizer = tokenizerFactory.Create(effectiveOptions);

    this.fieldToDetailsLookup[name] = new IndexedFieldDetails(fieldId, tokenizer);
    this.idToFieldLookup[fieldId] = name;
}
/// <summary>
/// Creates a tokenizer annotator, choosing a segmenter for languages that
/// need one and a tokenizer factory for the configured tokenizer type.
/// </summary>
/// <param name="verbose">Default for <c>Verbose</c> when <c>tokenize.verbose</c> is not set.</param>
/// <param name="props">Configuration; an empty set is used when null.</param>
/// <param name="options">Passed to <c>InitFactory</c>.</param>
/// <exception cref="System.Exception">
/// <c>tokenize.language</c> names a segmenter language other than Arabic or Chinese.
/// </exception>
public TokenizerAnnotator(bool verbose, Properties props, string options)
{
    props = props ?? new Properties();

    // Check whether segmenting must be done for the configured language.
    string languageName = props.GetProperty("tokenize.language");
    bool needsSegmenter = languageName != null && LanguageInfo.IsSegmenterLanguage(languageName);
    useSegmenter = needsSegmenter;

    if (!needsSegmenter)
    {
        segmenterAnnotator = null;
    }
    else
    {
        var language = LanguageInfo.GetLanguageFromString(languageName);
        if (language == LanguageInfo.HumanLanguage.Arabic)
        {
            segmenterAnnotator = new ArabicSegmenterAnnotator("segment", props);
        }
        else if (language == LanguageInfo.HumanLanguage.Chinese)
        {
            segmenterAnnotator = new ChineseSegmenterAnnotator("segment", props);
        }
        else
        {
            segmenterAnnotator = null;
            throw new Exception("No segmenter implemented for: " + language);
        }
    }

    Verbose = PropertiesUtils.GetBool(props, "tokenize.verbose", verbose);
    TokenizerAnnotator.TokenizerType type = TokenizerAnnotator.TokenizerType.GetTokenizerType(props);
    factory = InitFactory(type, props, options);
}
/// <summary>
/// Creates an index from fully resolved collaborators, building the field
/// lookup from the configured fields and creating the root node.
/// </summary>
/// <param name="indexOptions">Stored as given; not validated here.</param>
/// <param name="itemTokenizationOptions">Source of the configured fields.</param>
/// <param name="indexNodeFactory">Creates the root node.</param>
/// <param name="tokenizerFactory">Used when registering fields.</param>
/// <param name="queryParser">Stored for later use.</param>
/// <param name="defaultTokenizationOptions">Defaults used when registering fields.</param>
/// <param name="indexModifiedActions">Optional actions; may be null.</param>
/// <exception cref="ArgumentNullException">
/// Any argument other than <paramref name="indexOptions"/> and
/// <paramref name="indexModifiedActions"/> is null.
/// </exception>
internal FullTextIndex(
    IndexOptions indexOptions,
    ConfiguredItemTokenizationOptions<TKey> itemTokenizationOptions,
    IIndexNodeFactory indexNodeFactory,
    ITokenizerFactory tokenizerFactory,
    IQueryParser queryParser,
    TokenizationOptions defaultTokenizationOptions,
    Func<IIndexSnapshot<TKey>, Task>[]? indexModifiedActions)
{
    // Validate in declaration order so the first offending argument is reported.
    if (itemTokenizationOptions is null)
    {
        throw new ArgumentNullException(nameof(itemTokenizationOptions));
    }

    if (indexNodeFactory is null)
    {
        throw new ArgumentNullException(nameof(indexNodeFactory));
    }

    if (tokenizerFactory is null)
    {
        throw new ArgumentNullException(nameof(tokenizerFactory));
    }

    if (queryParser is null)
    {
        throw new ArgumentNullException(nameof(queryParser));
    }

    if (defaultTokenizationOptions is null)
    {
        throw new ArgumentNullException(nameof(defaultTokenizationOptions));
    }

    this.indexOptions = indexOptions;
    this.itemTokenizationOptions = itemTokenizationOptions;
    this.IndexNodeFactory = indexNodeFactory;
    this.tokenizerFactory = tokenizerFactory;
    this.queryParser = queryParser;
    this.defaultTokenizationOptions = defaultTokenizationOptions;
    this.indexModifiedActions = indexModifiedActions;

    this.idPool = new IdPool<TKey>();
    this.FieldLookup = new IndexedFieldLookup(
        this.itemTokenizationOptions.GetAllConfiguredFields(),
        tokenizerFactory,
        defaultTokenizationOptions);
    this.Root = this.IndexNodeFactory.CreateRootNode();
}
/// <summary>Sets the tokenizer factory used to chop up text into words for the documents.</summary>
/// <typeparam name="_T0">Token type produced by the factory.</typeparam>
/// <param name="tokenizerFactory">The factory to store; no validation is performed.</param>
public virtual void SetTokenizerFactory<_T0>(ITokenizerFactory<_T0> tokenizerFactory)
    where _T0 : IHasWord
    => this.tokenizerFactory = tokenizerFactory;
/// <summary>
/// Creates a reader/writer for input without domain labels, using the
/// fixed domain string "123".
/// </summary>
/// <param name="hasSegMarkers">if true, input has segmentation markers</param>
/// <param name="hasTags">if true, input has morphological analyses separated by tagDelimiter.</param>
/// <param name="tokFactory">a TokenizerFactory for the input</param>
public ArabicDocumentReaderAndWriter(
    bool hasSegMarkers,
    bool hasTags,
    ITokenizerFactory<CoreLabel> tokFactory)
    : this(hasSegMarkers, hasTags, hasDomainLabels: false, domain: "123", tokFactory: tokFactory)
{
}
/// <summary>
/// Creates a reader/writer that keeps orthographical rewrites in the gold
/// labels (rewrite stripping is off).
/// </summary>
/// <param name="hasSegMarkers">if true, input has segmentation markers</param>
/// <param name="hasTags">if true, input has morphological analyses separated by tagDelimiter.</param>
/// <param name="hasDomainLabels">
/// if true, input has a whitespace-terminated domain at the beginning
/// of each line of text
/// </param>
/// <param name="domain">forwarded unchanged to the full constructor</param>
/// <param name="tokFactory">a TokenizerFactory for the input</param>
public ArabicDocumentReaderAndWriter(
    bool hasSegMarkers,
    bool hasTags,
    bool hasDomainLabels,
    string domain,
    ITokenizerFactory<CoreLabel> tokFactory)
    : this(hasSegMarkers, hasTags, hasDomainLabels, domain, stripRewrites: false, tokFactory: tokFactory)
{
}
/// <summary>
/// This method lets you train and test a segmenter relative to a
/// Treebank.
/// </summary>
/// <remarks>
/// The method parses the option flags, then either trains a segmenter from
/// a treebank or loads a serialized one, optionally saves it as text and/or
/// in serialized form, and finally loads the test treebank if one was
/// requested.
/// <para>
/// <i>Implementation note:</i> This method is largely cloned from
/// LexicalizedParser's main method. Should we try to have it be able
/// to train segmenters to stop things going out of sync?
/// </para>
/// </remarks>
/// <param name="args">
/// Command-line arguments: leading <c>-option</c> flags followed by
/// order-dependent positional arguments.
/// </param>
public static void Main(string[] args)
{
    bool train = false;
    bool saveToSerializedFile = false;
    bool saveToTextFile = false;
    string serializedInputFileOrUrl = null;
    string textInputFileOrUrl = null;
    string serializedOutputFileOrUrl = null;
    string textOutputFileOrUrl = null;
    string treebankPath = null;
    Treebank testTreebank = null;
    string testPath = null;
    IFileFilter testFilter = null;
    IFileFilter trainFilter = null;
    string encoding = null;
    // variables needed to process the files to be parsed
    ITokenizerFactory<Word> tokenizerFactory = null;
    // whether or not the input file has already been tokenized
    bool tokenized = false;
    IFunction<IList<IHasWord>, IList<IHasWord>> escaper = new ChineseEscaper();
    int argIndex = 0;
    if (args.Length < 1)
    {
        log.Info("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl filename*");
        return;
    }

    Options op = new Options();
    op.tlpParams = new ChineseTreebankParserParams();

    // while loop through option arguments
    while (argIndex < args.Length && args[argIndex][0] == '-')
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-train"))
        {
            train = true;
            saveToSerializedFile = true;
            int numSubArgs = NumSubArgs(args, argIndex);
            argIndex++;
            // NOTE(review): "> 1" rejects "-train path" with a single
            // sub-argument even though the message only asks for a treebank
            // path; kept as-is — confirm whether ">= 1" was intended.
            if (numSubArgs > 1)
            {
                treebankPath = args[argIndex];
                argIndex++;
            }
            else
            {
                throw new Exception("Error: -train option must have treebankPath as first argument.");
            }
            if (numSubArgs == 2)
            {
                trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
            }
            else if (numSubArgs >= 3)
            {
                try
                {
                    int low = System.Convert.ToInt32(args[argIndex]);
                    int high = System.Convert.ToInt32(args[argIndex + 1]);
                    trainFilter = new NumberRangeFileFilter(low, high, true);
                    argIndex += 2;
                }
                catch (NumberFormatException)
                {
                    // maybe it's a ranges expression?
                    trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                    argIndex++;
                }
            }
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-encoding"))
        {
            // sets encoding for TreebankLangParserParams
            encoding = args[argIndex + 1];
            op.tlpParams.SetInputEncoding(encoding);
            op.tlpParams.SetOutputEncoding(encoding);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-loadFromSerializedFile"))
        {
            // load the parser from a binary serialized file
            // the next argument must be the path to the parser file
            serializedInputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        }
        // doesn't make sense to load from TextFile -pichuan
        // (so there is no "-loadFromTextFile" option and textInputFileOrUrl stays null)
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToSerializedFile"))
        {
            saveToSerializedFile = true;
            serializedOutputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToTextFile"))
        {
            // save the parser to declarative text file
            saveToTextFile = true;
            textOutputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
        {
            // the next argument is the treebank path and range for testing
            int numSubArgs = NumSubArgs(args, argIndex);
            argIndex++;
            if (numSubArgs == 1)
            {
                testFilter = new NumberRangesFileFilter(args[argIndex++], true);
            }
            else if (numSubArgs > 1)
            {
                testPath = args[argIndex++];
                if (numSubArgs == 2)
                {
                    testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                }
                else if (numSubArgs >= 3)
                {
                    try
                    {
                        int low = System.Convert.ToInt32(args[argIndex]);
                        int high = System.Convert.ToInt32(args[argIndex + 1]);
                        testFilter = new NumberRangeFileFilter(low, high, true);
                        argIndex += 2;
                    }
                    catch (NumberFormatException)
                    {
                        // maybe it's a ranges expression?
                        testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                    }
                }
            }
        }
        else
        {
            // Let the language-specific params consume the flag; an index
            // that does not advance means the flag was not recognized.
            int j = op.tlpParams.SetOptionFlag(args, argIndex);
            if (j == argIndex)
            {
                log.Info("Unknown option ignored: " + args[argIndex]);
                j++;
            }
            argIndex = j;
        }
    }
    // end while loop through arguments

    ITreebankLangParserParams tlpParams = op.tlpParams;

    // all other arguments are order dependent and
    // are processed in order below
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = null;
    if (!train && op.testOptions.verbose)
    {
        // DateTime.Now, not new DateTime(): the latter is 0001-01-01 rather
        // than the current time.
        System.Console.Out.WriteLine("Currently " + DateTime.Now);
        PrintArgs(args, System.Console.Out);
    }

    if (train)
    {
        PrintArgs(args, System.Console.Out);
        // so we train a parser using the treebank
        if (treebankPath == null)
        {
            // the next arg must be the treebank path, since it wasn't given earlier
            treebankPath = args[argIndex];
            argIndex++;
            if (args.Length > argIndex + 1)
            {
                try
                {
                    // the next two args might be the range
                    int low = System.Convert.ToInt32(args[argIndex]);
                    int high = System.Convert.ToInt32(args[argIndex + 1]);
                    trainFilter = new NumberRangeFileFilter(low, high, true);
                    argIndex += 2;
                }
                catch (NumberFormatException)
                {
                    // maybe it's a ranges expression?
                    trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                    argIndex++;
                }
            }
        }
        Treebank trainTreebank = MakeTreebank(treebankPath, op, trainFilter);
        IIndex<string> wordIndex = new HashIndex<string>();
        IIndex<string> tagIndex = new HashIndex<string>();
        cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
    }
    else if (textInputFileOrUrl == null)
    {
        // Loading from a text grammar file is not implemented
        // (XXXXX fix later -pichuan), so we load a serialized segmenter.
        if (serializedInputFileOrUrl == null)
        {
            // the next argument must be the path to the serialized parser
            serializedInputFileOrUrl = args[argIndex];
            argIndex++;
        }
        try
        {
            cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
        }
        catch (ArgumentException)
        {
            log.Info("Error loading segmenter, exiting...");
            System.Environment.Exit(0);
        }
    }

    // the following has to go after reading parser to make sure
    // op and tlpParams are the same for train and test
    TreePrint treePrint = op.testOptions.TreePrint(tlpParams);
    if (testFilter != null)
    {
        if (testPath == null)
        {
            if (treebankPath == null)
            {
                throw new Exception("No test treebank path specified...");
            }
            else
            {
                log.Info("No test treebank path specified. Using train path: \"" + treebankPath + "\"");
                testPath = treebankPath;
            }
        }
        testTreebank = tlpParams.TestMemoryTreebank();
        testTreebank.LoadPath(testPath, testFilter);
    }
    op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(tlpParams.SisterSplitters()));

    // at this point we should be sure that op.tlpParams is
    // set appropriately (from command line, or from grammar file),
    // and will never change again. We also set the tlpParams of the
    // LexicalizedParser instance to be the same object. This is
    // redundancy that we probably should take out eventually.
    //
    // -- Roger
    if (op.testOptions.verbose)
    {
        log.Info("Lexicon is " + cs.GetType().FullName);
    }
    PrintWriter pwOut = tlpParams.Pw();
    PrintWriter pwErr = tlpParams.Pw(System.Console.Error);

    // Now what do we do with the parser we've made
    if (saveToTextFile)
    {
        // save the parser to textGrammar format
        if (textOutputFileOrUrl != null)
        {
            SaveSegmenterDataToText(cs, textOutputFileOrUrl);
        }
        else
        {
            log.Info("Usage: must specify a text segmenter data output path");
        }
    }
    if (saveToSerializedFile)
    {
        if (serializedOutputFileOrUrl == null && argIndex < args.Length)
        {
            // the next argument must be the path to serialize to
            serializedOutputFileOrUrl = args[argIndex];
            argIndex++;
        }
        if (serializedOutputFileOrUrl != null)
        {
            SaveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
        }
        else if (textOutputFileOrUrl == null && testTreebank == null)
        {
            // no saving/parsing request has been specified
            // (a space now separates the class name from "-train")
            log.Info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + " -train trainFilesPath [start stop] serializedParserFilename");
        }
    }

    /* --------------------- Testing part!!!! ----------------------- */
    if (testTreebank != null || (argIndex < args.Length && Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank")))
    {
        // test parser on treebank
        if (testTreebank == null)
        {
            // the next argument is the treebank path and range for testing
            testTreebank = tlpParams.TestMemoryTreebank();
            if (args.Length < argIndex + 4)
            {
                testTreebank.LoadPath(args[argIndex + 1]);
            }
            else
            {
                int testlow = System.Convert.ToInt32(args[argIndex + 2]);
                int testhigh = System.Convert.ToInt32(args[argIndex + 3]);
                testTreebank.LoadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
            }
        }
    }
}
/// <summary>
/// Creates a text-processor factory from the three component factories it
/// stores.
/// </summary>
/// <param name="sdFact">Sentence detector factory.</param>
/// <param name="tokFact">Tokenizer factory.</param>
/// <param name="stemFact">Stemmer factory.</param>
public TextProcessorFactory(
    ISentenceDetectorFactory sdFact,
    ITokenizerFactory tokFact,
    IStemmerFactory stemFact)
{
    // Plain field stores; no validation is performed.
    this.stemFact = stemFact;
    this.tokFact = tokFact;
    this.sdFact = sdFact;
}