Ejemplo n.º 1
0
        /// <summary>
        /// Records a replacement to be applied to the input stream. Whenever
        /// <paramref name="singleMatch"/> occurs in the input, it will be replaced
        /// with <paramref name="replacement"/>.
        /// </summary>
        /// <param name="singleMatch">Input string to be replaced; must not be null.</param>
        /// <param name="replacement">Output string; must not be null.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="singleMatch"/> or <paramref name="replacement"/> is null.
        /// </exception>
        /// <exception cref="System.SystemException">
        /// A mapping for <paramref name="singleMatch"/> has already been recorded.
        /// </exception>
        public virtual void  Add(System.String singleMatch, System.String replacement)
        {
            // Validate before touching the trie: previously a null replacement was only
            // dereferenced after the nodes for singleMatch had already been created.
            if (singleMatch == null)
            {
                throw new System.ArgumentNullException("singleMatch");
            }
            if (replacement == null)
            {
                throw new System.ArgumentNullException("replacement");
            }

            NormalizeCharMap currMap = this;

            // Walk (and extend where necessary) one trie level per character of singleMatch.
            for (int i = 0; i < singleMatch.Length; i++)
            {
                // Use the same key object for the lookup and the insert.
                object key = CharacterCache.ValueOf(singleMatch[i]);
                if (currMap.submap == null)
                {
                    currMap.submap = new System.Collections.Hashtable(1);
                }
                NormalizeCharMap map = (NormalizeCharMap)currMap.submap[key];
                if (map == null)
                {
                    map = new NormalizeCharMap();
                    currMap.submap[key] = map;
                }
                currMap = map;
            }

            // The node reached for the last character carries the replacement; a non-null
            // normStr means this exact match string was registered before.
            if (currMap.normStr != null)
            {
                throw new System.SystemException("MappingCharFilter: there is already a mapping for " + singleMatch);
            }
            currMap.normStr = replacement;
            // Length difference between match and replacement, stored for later offset correction.
            currMap.diff    = singleMatch.Length - replacement.Length;
        }
	  /// <summary>
	  /// Reads the mapping rules named by <c>mapping</c> through <paramref name="loader"/>,
	  /// parses them with <c>parseRules</c> and stores the built map in <c>normMap</c>.
	  /// Does nothing when <c>mapping</c> is null. When a file exists at the path
	  /// <c>mapping</c>, its lines are used as-is; otherwise <c>mapping</c> is split with
	  /// <c>splitFileNames</c> and the lines of every named file are concatenated.
	  /// </summary>
	  /// <param name="loader">Resource loader handed to <c>getLines</c>.</param>
	  /// <remarks>The Java original was declared <c>throws java.io.IOException</c>.</remarks>
	  // TODO: this should use inputstreams from the loader, not File!
	  public virtual void inform(ResourceLoader loader)
	  {
		if (mapping != null)
		{
		  // Concrete List<string> so that AddRange is available (IList<string> has none),
		  // and with an explicit type argument (the Java diamond "List<>" is not valid C#).
		  List<string> wlist = new List<string>();
		  if (System.IO.File.Exists(mapping))
		  {
			wlist.AddRange(getLines(loader, mapping));
		  }
		  else
		  {
			foreach (string file in splitFileNames(mapping))
			{
			  wlist.AddRange(getLines(loader, file.Trim()));
			}
		  }
		  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
		  parseRules(wlist, builder);
		  normMap = builder.build();
		  if (normMap.map == null)
		  {
			// if the inner FST is null, it means it accepts nothing (e.g. the file is empty)
			// so just set the whole map to null
			normMap = null;
		  }
		}
	  }
Ejemplo n.º 3
0
        /// <summary>
        /// Reads the mapping rules named by <c>mapping</c> through <paramref name="loader"/>,
        /// parses them with <c>parseRules</c> and stores the built map in <c>normMap</c>.
        /// Does nothing when <c>mapping</c> is null. When a file exists at the path
        /// <c>mapping</c>, its lines are used as-is; otherwise <c>mapping</c> is split with
        /// <c>splitFileNames</c> and the lines of every named file are concatenated.
        /// </summary>
        /// <param name="loader">Resource loader handed to <c>getLines</c>.</param>
        /// <remarks>The Java original was declared <c>throws java.io.IOException</c>.</remarks>
        // TODO: this should use inputstreams from the loader, not File!
        public virtual void inform(ResourceLoader loader)
        {
            if (mapping != null)
            {
                // Concrete List<string> so that AddRange is available (IList<string> has none),
                // and with an explicit type argument (the Java diamond "List<>" is not valid C#).
                List<string> wlist = new List<string>();
                if (System.IO.File.Exists(mapping))
                {
                    wlist.AddRange(getLines(loader, mapping));
                }
                else
                {
                    foreach (string file in splitFileNames(mapping))
                    {
                        wlist.AddRange(getLines(loader, file.Trim()));
                    }
                }
                NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
                parseRules(wlist, builder);
                normMap = builder.build();
                if (normMap.map == null)
                {
                    // if the inner FST is null, it means it accepts nothing (e.g. the file is empty)
                    // so just set the whole map to null
                    normMap = null;
                }
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Tries to extend the current match starting at <paramref name="map"/> by consuming
        /// further input characters. Returns the deepest node reached that has a
        /// <c>normStr</c>, or null when neither a descendant nor <paramref name="map"/>
        /// itself has one. A character that was read but did not lead to a match is
        /// handed back through <c>PushChar</c>.
        /// </summary>
        private NormalizeCharMap Match(NormalizeCharMap map)
        {
            if (map.submap != null)
            {
                int next = NextChar();
                if (next != -1)
                {
                    NormalizeCharMap child = (NormalizeCharMap)map.submap[CharacterCache.ValueOf((char)next)];
                    NormalizeCharMap longer = (child != null) ? Match(child) : null;
                    if (longer != null)
                    {
                        // A longer match wins over anything this node could offer.
                        return longer;
                    }

                    // The character did not contribute to a match: un-read it.
                    PushChar(next);
                }
            }

            // Fall back to this node, provided it terminates a mapping.
            return (map.normStr != null) ? map : null;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Runs <c>CheckAnalysisConsistency</c> on a fixed input through an analyzer built from
        /// MockCharFilter + MappingCharFilter (char filters) and MockTokenizer + CommonGramsFilter.
        /// </summary>
        public virtual void Test()
        {
            // Common-words set handed to the CommonGramsFilter.
            CharArraySet commonWords = new CharArraySet(TEST_VERSION_CURRENT, 3, false);
            foreach (string word in new string[] { "jjp", "wlmwoknt", "tcgyreo" })
            {
                commonWords.add(word);
            }

            // Character mappings applied before tokenization.
            NormalizeCharMap.Builder rules = new NormalizeCharMap.Builder();
            rules.Add("mtqlpi", "");
            rules.Add("mwoknt", "jjp");
            rules.Add("tcgyreo", "zpfpajyws");
            NormalizeCharMap charMap = rules.Build();

            Analyzer analyzer = Analyzer.NewAnonymous(
                createComponents: (fieldName, reader) =>
                {
                    var guarded = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader);
                    Tokenizer source = new MockTokenizer(guarded, MockTokenFilter.ENGLISH_STOPSET, false, -65);
                    TokenFilter grams = new CommonGramsFilter(TEST_VERSION_CURRENT, source, commonWords);
                    return new TokenStreamComponents(source, grams);
                },
                initReader: (fieldName, reader) =>
                    new MappingCharFilter(charMap, new MockCharFilter(reader, 0)));

            CheckAnalysisConsistency(Random, analyzer, false, "wmgddzunizdomqyj");
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Wraps <paramref name="reader"/> in a <c>MappingCharFilter</c> that replaces every
        /// entry of <c>_NormalizeChars</c> with a single space.
        /// Background: http://stackoverflow.com/questions/15235126/lucene-4-1-how-split-words-that-contains-dots-when-indexing
        /// </summary>
        /// <param name="reader">The reader to wrap.</param>
        /// <returns>The mapping filter reading from <paramref name="reader"/>.</returns>
        private TextReader InitReader(TextReader reader)
        {
            var toSpace = new NormalizeCharMap();
            foreach (string token in this._NormalizeChars)
            {
                toSpace.Add(token, " ");
            }

            var filtered = new MappingCharFilter(toSpace, reader);
            return filtered;
        }
Ejemplo n.º 7
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testNormalizeWinDelimToLinuxDelim() throws Exception
        public virtual void testNormalizeWinDelimToLinuxDelim()
        {
            // Map every backslash to a forward slash before the tokenizer sees the path.
            NormalizeCharMap.Builder slashRule = new NormalizeCharMap.Builder();
            slashRule.add("\\", "/");

            string path = "c:\\a\\b\\c";
            Reader cs = new MappingCharFilter(slashRule.build(), new StringReader(path));
            PathHierarchyTokenizer tokenizer = new PathHierarchyTokenizer(cs);

            // Expected terms with their start offsets, end offsets and position increments.
            string[] terms = new string[] { "c:", "c:/a", "c:/a/b", "c:/a/b/c" };
            int[] startOffsets = new int[] { 0, 0, 0, 0 };
            int[] endOffsets = new int[] { 2, 4, 6, 8 };
            int[] posIncrements = new int[] { 1, 0, 0, 0 };

            assertTokenStreamContents(tokenizer, terms, startOffsets, endOffsets, posIncrements, path.Length);
        }
        /// <summary>
        /// Analyzes "banküberfall" with a "ü" to "ue" character mapping and a one-word
        /// dictionary ("fall"), asserting the produced terms and their offsets.
        /// </summary>
        public virtual void TestInvalidOffsets()
        {
            CharArraySet dictionary = makeDictionary("fall");

            NormalizeCharMap.Builder umlautRule = new NormalizeCharMap.Builder();
            umlautRule.Add("ü", "ue");

            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, dictionary, umlautRule.Build());

            string[] terms = new string[] { "bankueberfall", "fall" };
            int[] startOffsets = new int[] { 0, 0 };
            int[] endOffsets = new int[] { 12, 12 };
            AssertAnalyzesTo(analyzer, "banküberfall", terms, startOffsets, endOffsets);
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Asserts terms and offsets for the input "ab" when "a" and "b" are each mapped to a
        /// two-character string before analysis.
        /// </summary>
        public virtual void TestChangedOffsets()
        {
            NormalizeCharMap.Builder rules = new NormalizeCharMap.Builder();
            rules.Add("a", "一二");
            rules.Add("b", "二三");

            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, rules.Build());

            // note: offsets are strange since this is how the charfilter maps them...
            // before bigramming, the 4 tokens look like:
            //   { 0, 0, 1, 1 },
            //   { 0, 1, 1, 2 }
            string[] terms = new string[] { "一二", "二二", "二三" };
            int[] startOffsets = new int[] { 0, 0, 1 };
            int[] endOffsets = new int[] { 1, 1, 2 };
            AssertAnalyzesTo(analyzer, "ab", terms, startOffsets, endOffsets);
        }
Ejemplo n.º 10
0
	  /// <summary>
	  /// Creates a filter that reads from <paramref name="in"/> and takes its mapping data
	  /// (<c>map</c> and <c>cachedRootArcs</c>) from <paramref name="normMap"/>.
	  /// </summary>
	  /// <param name="normMap">Source of the mapping data.</param>
	  /// <param name="in">Reader supplying the characters to filter.</param>
	  public MappingCharFilter(NormalizeCharMap normMap, TextReader @in) : base(@in)
	  {
		buffer.Reset(@in);

		map = normMap.map;
		cachedRootArcs = normMap.cachedRootArcs;

		// Without a map there is nothing to read from.
		fstReader = (map == null) ? null : map.BytesReader;
	  }
Ejemplo n.º 11
0
        /// <summary>
        /// Creates a filter that reads from <paramref name="in"/> and takes its mapping data
        /// (<c>map</c> and <c>cachedRootArcs</c>) from <paramref name="normMap"/>.
        /// </summary>
        /// <param name="normMap">Source of the mapping data.</param>
        /// <param name="in">Reader supplying the characters to filter.</param>
        public MappingCharFilter(NormalizeCharMap normMap, TextReader @in) : base(@in)
        {
            buffer.Reset(@in);

            map = normMap.map;
            cachedRootArcs = normMap.cachedRootArcs;

            // Without a map there is nothing to read from.
            fstReader = (map == null) ? null : map.BytesReader;
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Runs <c>CheckAnalysisConsistency</c> on a fixed input through an analyzer that is
        /// configured with a three-word set and three character mappings.
        /// </summary>
        public virtual void Test()
        {
            CharArraySet commonWords = new CharArraySet(TEST_VERSION_CURRENT, 3, false);
            foreach (string word in new string[] { "jjp", "wlmwoknt", "tcgyreo" })
            {
                commonWords.add(word);
            }

            NormalizeCharMap.Builder rules = new NormalizeCharMap.Builder();
            rules.Add("mtqlpi", "");
            rules.Add("mwoknt", "jjp");
            rules.Add("tcgyreo", "zpfpajyws");

            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, commonWords, rules.Build());

            CheckAnalysisConsistency(Random(), analyzer, false, "wmgddzunizdomqyj");
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Asserts terms and offsets for the input "ab" when a <c>MappingCharFilter</c> maps
        /// "a" and "b" to two-character strings ahead of StandardTokenizer + CJKBigramFilter.
        /// </summary>
        public virtual void TestChangedOffsets()
        {
            NormalizeCharMap.Builder rules = new NormalizeCharMap.Builder();
            rules.Add("a", "一二");
            rules.Add("b", "二三");
            NormalizeCharMap norm = rules.Build();

            Analyzer analyzer = Analyzer.NewAnonymous(
                createComponents: (fieldName, reader) =>
                {
                    Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
                    var bigrams = new CJKBigramFilter(source);
                    return new TokenStreamComponents(source, bigrams);
                },
                initReader: (fieldName, reader) => new MappingCharFilter(norm, reader));

            // note: offsets are strange since this is how the charfilter maps them...
            // before bigramming, the 4 tokens look like:
            //   { 0, 0, 1, 1 },
            //   { 0, 1, 1, 2 }
            string[] terms = new string[] { "一二", "二二", "二三" };
            int[] startOffsets = new int[] { 0, 0, 1 };
            int[] endOffsets = new int[] { 1, 1, 2 };
            AssertAnalyzesTo(analyzer, "ab", terms, startOffsets, endOffsets);
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Test that offsets are correct when a MappingCharFilter is previously applied.
        /// </summary>
        /// <remarks>The Java original was declared <c>throws java.io.IOException</c>.</remarks>
        public virtual void testChangedOffsets()
        {
            NormalizeCharMap.Builder rules = new NormalizeCharMap.Builder();
            rules.add("a", "一二");
            rules.add("b", "二三");

            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, rules.build());

            // note: offsets are strange since this is how the charfilter maps them...
            // before bigramming, the 4 tokens look like:
            //   { 0, 0, 1, 1 },
            //   { 0, 1, 1, 2 }
            string[] terms = new string[] { "一二", "二二", "二三" };
            int[] startOffsets = new int[] { 0, 0, 1 };
            int[] endOffsets = new int[] { 1, 1, 2 };
            assertAnalyzesTo(analyzer, "ab", terms, startOffsets, endOffsets);
        }
Ejemplo n.º 15
0
        /// <summary>
        /// Returns the next character of the filtered stream, or -1 once <c>NextChar()</c>
        /// reports -1. Characters of a pending <c>replacement</c> string are handed out
        /// first; otherwise the next input character is looked up in <c>normMap</c> and,
        /// when <c>Match</c> finds a mapping, that mapping's <c>normStr</c> becomes the new
        /// pending replacement and offset corrections are recorded.
        /// </summary>
        /// <returns>The next character as an int, or -1.</returns>
        public override int Read()
        {
            // Loops so that, after a mapping is found, the next iteration emits its first
            // replacement character (or moves on to more input if the replacement is empty).
            while (true)
            {
                // Drain the pending replacement, one character per call.
                if (replacement != null && charPointer < replacement.Length)
                {
                    return(replacement[charPointer++]);
                }

                int firstChar = NextChar();
                if (firstChar == -1)
                {
                    return(-1);
                }
                // Look the character up in the root map; null when the root has no children.
                NormalizeCharMap nm = normMap.submap != null?(NormalizeCharMap)normMap.submap[CharacterCache.ValueOf((char)firstChar)]:null;
                if (nm == null)
                {
                    // No mapping starts with this character: pass it through unchanged.
                    return(firstChar);
                }
                NormalizeCharMap result = Match(nm);
                if (result == null)
                {
                    // A mapping starts with this character but none matched here: pass it through.
                    return(firstChar);
                }
                // A mapping matched: queue its replacement text for the next loop iteration.
                replacement = result.normStr;
                charPointer = 0;
                // NOTE(review): diff looks like (match length - replacement length) — confirm
                // against where NormalizeCharMap.diff is assigned.
                if (result.diff != 0)
                {
                    int prevCumulativeDiff = GetLastCumulativeDiff();
                    if (result.diff < 0)
                    {
                        // Negative diff: one correction entry per unit of |diff|, each lowering
                        // the cumulative diff by one more.
                        for (int i = 0; i < -result.diff; i++)
                        {
                            AddOffCorrectMap(nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i);
                        }
                    }
                    else
                    {
                        // Positive diff: a single correction entry raising the cumulative diff by diff.
                        AddOffCorrectMap(nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff);
                    }
                }
            }
        }
        /// <summary>
        /// Wraps <paramref name="reader"/> in a <c>MappingCharFilter</c> that maps several
        /// apostrophe variants to "'", removes U+0301 and U+00AD, and maps "ґ"/"Ґ" to "г"/"Г".
        /// </summary>
        /// <param name="fieldName">Not used by this override.</param>
        /// <param name="reader">The reader to wrap.</param>
        /// <returns>The mapping filter reading from <paramref name="reader"/>.</returns>
        protected override TextReader InitReader(string fieldName, TextReader reader)
        {
            // Each row is { match, replacement }; rows are registered in the order listed.
            string[][] rules = new string[][]
            {
                // different apostrophes
                new string[] { "\u2019", "'" },
                new string[] { "\u2018", "'" },
                new string[] { "\u02BC", "'" },
                new string[] { "`", "'" },
                new string[] { "´", "'" },
                // ignored characters
                new string[] { "\u0301", "" },
                new string[] { "\u00AD", "" },
                new string[] { "ґ", "г" },
                new string[] { "Ґ", "Г" },
            };

            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
            foreach (string[] rule in rules)
            {
                builder.Add(rule[0], rule[1]);
            }

            return new MappingCharFilter(builder.Build(), reader);
        }
Ejemplo n.º 17
0
        /// <summary>
        /// Checks the offsets reported by <c>PatternTokenizer</c> when its input has first
        /// passed through a <c>MappingCharFilter</c> that replaces <c>&amp;uuml;</c> with
        /// <c>ü</c>: the expected offsets are positions in the unmapped <c>INPUT</c> string.
        /// </summary>
        public virtual void TestOffsetCorrection()
        {
            const string INPUT = "G&uuml;nther G&uuml;nther is here";

            // create MappingCharFilter (rule-file form of this mapping: "&uuml;" => "ü").
            // The former mappingRules list was filled but never read, so it has been removed.
            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
            builder.Add("&uuml;", "ü");
            NormalizeCharMap normMap    = builder.Build();
            CharFilter       charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

            // create PatternTokenizer: group -1 uses the pattern as a delimiter
            TokenStream stream = new PatternTokenizer(charStream, new Regex("[,;/\\s]+", RegexOptions.Compiled), -1);

            AssertTokenStreamContents(stream, new string[] { "Günther", "Günther", "is", "here" }, new int[] { 0, 13, 26, 29 }, new int[] { 12, 25, 28, 33 }, INPUT.Length);

            // Same input again, this time emitting group 0 (the whole match) as the token.
            charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
            stream     = new PatternTokenizer(charStream, new Regex("Günther", RegexOptions.Compiled), 0);
            AssertTokenStreamContents(stream, new string[] { "Günther", "Günther" }, new int[] { 0, 13 }, new int[] { 12, 25 }, INPUT.Length);
        }
Ejemplo n.º 18
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void test() throws Exception
        public virtual void test()
        {
            // Word set passed to the analyzer helper.
            CharArraySet commonWords = new CharArraySet(TEST_VERSION_CURRENT, 3, false);
            foreach (string word in new string[] { "jjp", "wlmwoknt", "tcgyreo" })
            {
                commonWords.add(word);
            }

            // Character mappings passed to the analyzer helper.
            NormalizeCharMap.Builder rules = new NormalizeCharMap.Builder();
            rules.add("mtqlpi", "");
            rules.add("mwoknt", "jjp");
            rules.add("tcgyreo", "zpfpajyws");

            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, commonWords, rules.build());

            checkAnalysisConsistency(random(), analyzer, false, "wmgddzunizdomqyj");
        }
Ejemplo n.º 19
0
 /// <summary>
 /// Easy-use constructor that takes a <see cref="System.IO.TextReader"/>; the reader is
 /// passed through <c>CharReader.Get</c> before being handed to the base constructor.
 /// </summary>
 /// <param name="normMap">Mapping stored in <c>this.normMap</c>.</param>
 /// <param name="in_Renamed">Reader supplying the input characters.</param>
 public MappingCharFilter(NormalizeCharMap normMap, System.IO.TextReader in_Renamed) : base(CharReader.Get(in_Renamed))
 {
     this.normMap = normMap;
 }
Ejemplo n.º 20
0
 /// <summary>
 /// Stores the enclosing test instance, the word set and the character map in fields.
 /// </summary>
 /// <param name="outerInstance">The enclosing <c>TestBugInSomething</c> instance.</param>
 /// <param name="cas">Word set stored in <c>this.cas</c>.</param>
 /// <param name="map">Character map stored in <c>this.map</c>.</param>
 public AnalyzerAnonymousInnerClassHelper(TestBugInSomething outerInstance, CharArraySet cas, NormalizeCharMap map)
 {
     this.outerInstance = outerInstance;
     this.cas           = cas;
     this.map           = map;
 }
 /// <summary>
 /// Stores the enclosing test instance, the dictionary and the character map in fields.
 /// </summary>
 /// <param name="outerInstance">The enclosing <c>TestCompoundWordTokenFilter</c> instance.</param>
 /// <param name="dict">Dictionary stored in <c>this.dict</c>.</param>
 /// <param name="normMap">Character map stored in <c>this.normMap</c>.</param>
 public AnalyzerAnonymousInnerClassHelper(TestCompoundWordTokenFilter outerInstance, CharArraySet dict, NormalizeCharMap normMap)
 {
     this.outerInstance = outerInstance;
     this.dict          = dict;
     this.normMap       = normMap;
 }
	  /// <summary>
	  /// Parses every entry of <paramref name="rules"/> with the pattern <c>p</c> and registers
	  /// the two captured groups (each passed through <c>parseString</c>) on
	  /// <paramref name="builder"/> as a match/replacement pair.
	  /// </summary>
	  /// <param name="rules">Rule lines to parse.</param>
	  /// <param name="builder">Builder that receives one <c>add</c> call per rule.</param>
	  /// <exception cref="System.ArgumentException">A rule is not matched by the pattern.</exception>
	  protected internal virtual void parseRules(IList<string> rules, NormalizeCharMap.Builder builder)
	  {
		foreach (string line in rules)
		{
		  Matcher parsed = p.matcher(line);
		  if (parsed.find())
		  {
			var match = parseString(parsed.group(1));
			var replacement = parseString(parsed.group(2));
			builder.add(match, replacement);
		  }
		  else
		  {
			throw new System.ArgumentException("Invalid Mapping Rule : [" + line + "], file = " + mapping);
		  }
		}
	  }
Ejemplo n.º 23
0
 /// <summary>
 /// Stores the enclosing test instance and the character map in fields.
 /// </summary>
 /// <param name="outerInstance">The enclosing <c>TestCJKAnalyzer</c> instance.</param>
 /// <param name="norm">Character map stored in <c>this.norm</c>.</param>
 public AnalyzerAnonymousInnerClassHelper(TestCJKAnalyzer outerInstance, NormalizeCharMap norm)
 {
     this.outerInstance = outerInstance;
     this.norm          = norm;
 }
Ejemplo n.º 24
0
 /// <summary>
 /// Default constructor that takes a <c>CharStream</c>, which is passed to the base
 /// constructor unchanged.
 /// </summary>
 /// <param name="normMap">Mapping stored in <c>this.normMap</c>.</param>
 /// <param name="in_Renamed">Character stream supplying the input.</param>
 public MappingCharFilter(NormalizeCharMap normMap, CharStream in_Renamed) : base(in_Renamed)
 {
     this.normMap = normMap;
 }