/// <summary>
/// Creates an <see cref="ICUTokenizer"/> that splits the text supplied by a
/// <see cref="TextReader"/> into words, driven by a tailored
/// <see cref="BreakIterator"/> configuration.
/// </summary>
/// <param name="factory">The <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> passed on to the base constructor.</param>
/// <param name="input">The <see cref="TextReader"/> that provides the text to tokenize.</param>
/// <param name="config">The tailored <see cref="BreakIterator"/> configuration.</param>
public ICUTokenizer(AttributeFactory factory, TextReader input, ICUTokenizerConfig config)
    : base(factory, input)
{
    // Register the attributes this tokenizer uses (registration order is kept as-is).
    offsetAtt = AddAttribute<IOffsetAttribute>();
    termAtt = AddAttribute<ICharTermAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    scriptAtt = AddAttribute<IScriptAttribute>();

    // Remember the configuration and build the break iterator from it.
    this.config = config;
    this.breaker = new CompositeBreakIterator(config);
}
/// <summary>
/// Picks the tokenizer configuration: the plain default configuration when
/// the tailored collection is empty, otherwise a configuration that is also
/// given the tailored entries and the resource loader.
/// </summary>
/// <param name="loader">The <see cref="IResourceLoader"/> handed to the tailored configuration.</param>
public virtual void Inform(IResourceLoader loader)
{
    Debug.Assert(tailored != null, "init must be called first!");

    // Nothing tailored: the stock configuration is sufficient.
    if (tailored.Count == 0)
    {
        config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords);
        return;
    }

    // Tailored entries are present: pass them, together with the loader, to the helper.
    config = new DefaultICUTokenizerConfigAnonymousHelper(cjkAsWords, myanmarAsWords, tailored, loader);
}
/// <summary>
/// Creates an <see cref="ICUTokenizer"/> that splits the text supplied by a
/// <see cref="TextReader"/> into words, driven by a tailored
/// <see cref="BreakIterator"/> configuration.
/// </summary>
/// <remarks>
/// Delegates to the three-argument constructor, passing
/// <c>AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY</c> as the attribute factory.
/// </remarks>
/// <param name="input">The <see cref="TextReader"/> that provides the text to tokenize.</param>
/// <param name="config">The tailored <see cref="BreakIterator"/> configuration.</param>
public ICUTokenizer(TextReader input, ICUTokenizerConfig config)
    : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, config)
{
}
/// <summary>
/// Creates a <see cref="CompositeBreakIterator"/> bound to the given configuration.
/// </summary>
/// <param name="config">
/// The configuration to keep; its <c>CombineCJ</c> value is used to create the script iterator.
/// </param>
public CompositeBreakIterator(ICUTokenizerConfig config)
{
    this.config = config;

    // Read the setting once and hand it to the script iterator.
    var combineCJ = config.CombineCJ;
    scriptIterator = new ScriptIterator(combineCJ);
}