/// <summary>
/// Creates a new <see cref="CJKBigramFilterFactory"/>
/// </summary>
/// <param name="args">
/// Factory arguments. The boolean options "han", "hiragana", "katakana" and "hangul"
/// (each defaulting to <c>true</c>) select the scripts to combine into the flag set, and
/// "outputUnigrams" (defaulting to <c>false</c>) is read as well. </param>
/// <exception cref="ArgumentException"> if <paramref name="args"/> still holds entries once
/// all of the options above have been read. </exception>
public CJKBigramFilterFactory(IDictionary<string, string> args)
    : base(args)
{
    const CJKScript NoScript = 0;

    // Accumulate one flag per enabled script; the options are read in the same order as before.
    CJKScript selected = NoScript;
    selected |= GetBoolean(args, "han", true) ? CJKScript.HAN : NoScript;
    selected |= GetBoolean(args, "hiragana", true) ? CJKScript.HIRAGANA : NoScript;
    selected |= GetBoolean(args, "katakana", true) ? CJKScript.KATAKANA : NoScript;
    selected |= GetBoolean(args, "hangul", true) ? CJKScript.HANGUL : NoScript;

    this.flags = selected;
    this.outputUnigrams = GetBoolean(args, "outputUnigrams", false);

    // Anything still present at this point is treated as an unrecognised option.
    // NOTE(review): presumably GetBoolean removes the keys it reads - confirm in the base class.
    if (args.Count > 0)
    {
        string message = string.Format(J2N.Text.StringFormatter.CurrentCulture, "Unknown parameters: {0}", args);
        throw new ArgumentException(message);
    }
}
/// <summary>
/// Creates a new <see cref="CJKBigramFilterFactory"/>
/// </summary>
/// <param name="args">
/// Factory arguments. The boolean options "han", "hiragana", "katakana" and "hangul"
/// (each defaulting to <c>true</c>) select the scripts to combine into the flag set, and
/// "outputUnigrams" (defaulting to <c>false</c>) is read as well. </param>
/// <exception cref="System.ArgumentException"> if <paramref name="args"/> still holds entries once
/// all of the options above have been read; the message lists the leftover entries. </exception>
public CJKBigramFilterFactory(IDictionary<string, string> args)
    : base(args)
{
    CJKScript flags = 0;
    if (GetBoolean(args, "han", true))
    {
        flags |= CJKScript.HAN;
    }
    if (GetBoolean(args, "hiragana", true))
    {
        flags |= CJKScript.HIRAGANA;
    }
    if (GetBoolean(args, "katakana", true))
    {
        flags |= CJKScript.KATAKANA;
    }
    if (GetBoolean(args, "hangul", true))
    {
        flags |= CJKScript.HANGUL;
    }
    this.flags = flags;
    this.outputUnigrams = GetBoolean(args, "outputUnigrams", false);

    // Anything still present at this point is treated as an unrecognised option.
    // NOTE(review): presumably GetBoolean removes the keys it reads - confirm in the base class.
    if (args.Count > 0)
    {
        // Concatenating the dictionary itself would call ToString(), which for the standard
        // .NET dictionaries yields only the type name. Render the entries explicitly so the
        // message actually names the offending parameters.
        System.Text.StringBuilder unknown = new System.Text.StringBuilder();
        foreach (KeyValuePair<string, string> entry in args)
        {
            if (unknown.Length > 0)
            {
                unknown.Append(", ");
            }
            unknown.Append(entry.Key).Append('=').Append(entry.Value);
        }
        throw new System.ArgumentException("Unknown parameters: {" + unknown + "}");
    }
}
/// <summary>
/// Builds a <see cref="CJKBigramFilter"/> over the given stream, choosing which writing systems
/// get bigrammed and whether unigrams are emitted alongside the bigrams.
/// </summary>
/// <param name="in">
/// Input <see cref="TokenStream"/> </param>
/// <param name="flags"> OR'ed set from <see cref="CJKScript.HAN"/>, <see cref="CJKScript.HIRAGANA"/>,
/// <see cref="CJKScript.KATAKANA"/>, <see cref="CJKScript.HANGUL"/> </param>
/// <param name="outputUnigrams"> true if unigrams for the selected writing systems should also be output.
/// when this is false, this is only done when there are no adjacent characters to form
/// a bigram. </param>
public CJKBigramFilter(TokenStream @in, CJKScript flags, bool outputUnigrams)
    : base(@in)
{
    // For each script: keep its type marker when its bit is set in flags, otherwise store NO.
    doHan = (flags & CJKScript.HAN) != 0 ? HAN_TYPE : NO;
    doHiragana = (flags & CJKScript.HIRAGANA) != 0 ? HIRAGANA_TYPE : NO;
    doKatakana = (flags & CJKScript.KATAKANA) != 0 ? KATAKANA_TYPE : NO;
    doHangul = (flags & CJKScript.HANGUL) != 0 ? HANGUL_TYPE : NO;

    this.outputUnigrams = outputUnigrams;

    // Attributes are registered in the same order as before.
    termAtt = AddAttribute<ICharTermAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    posLengthAtt = AddAttribute<IPositionLengthAttribute>();
}
/// <summary>
/// Convenience overload that forwards to
/// <see cref="CJKBigramFilter.CJKBigramFilter(TokenStream, CJKScript, bool)">
/// CJKBigramFilter(in, flags, false)</see>, i.e. with <c>outputUnigrams</c> set to <c>false</c>.
/// </summary>
/// <param name="in">
/// Input <see cref="TokenStream"/> </param>
/// <param name="flags"> OR'ed set from <see cref="CJKScript.HAN"/>, <see cref="CJKScript.HIRAGANA"/>,
/// <see cref="CJKScript.KATAKANA"/>, <see cref="CJKScript.HANGUL"/> </param>
public CJKBigramFilter(TokenStream @in, CJKScript flags)
    : this(@in, flags, outputUnigrams: false)
{
}