示例#1
0
        /**
         * Tokenizes <code>source</code> with a tokenizer obtained from
         * <code>tokFactory</code> and returns the text of every token whose
         * term is non-empty, in stream order.
         *
         * An IOException raised while reading tokens is rethrown wrapped in a
         * System.ApplicationException; the reader is closed in all cases.
         */
        private static List /*<String>*/ splitByTokenizer(string source, TokenizerFactory tokFactory)
        {
            StringReader      input  = new StringReader(source);
            TokenStream       stream = loadTokenizer(tokFactory, input);
            List /*<String>*/ terms  = new ArrayList/*<String>*/ ();

            try
            {
                // TokenStream.next() is flagged obsolete (warning 612); silence it for the loop.
#pragma warning disable 612
                Token current = stream.next();
                while (current != null)
                {
                    string term = new string(current.termBuffer(), 0, current.termLength());
                    if (term.Length != 0)
                    {
                        terms.add(term);
                    }
                    current = stream.next();
                }
#pragma warning restore 612
            }
            catch (IOException ioe)
            {
                throw new System.ApplicationException("Unexpected exception.", ioe);
            }
            finally
            {
                input.close();
            }
            return terms;
        }
示例#2
0
        /**
         * Parses synonym rules and registers the resulting mappings in <code>map</code>.
         *
         * Each rule is split on <code>mappingSep</code>:
         * <ul>
         * <li>two parts ("lhs =&gt; rhs"): every entry on the left maps to all entries on the right;</li>
         * <li>one part: with <code>expansion</code> every entry maps to all entries,
         *     otherwise every entry maps to the first entry only.</li>
         * </ul>
         * Entries within a part are separated by <code>synSep</code>.
         *
         * @param rules      the rule strings to parse
         * @param map        the synonym map that receives the mappings
         * @param mappingSep separator between the source and target side of a rule
         * @param synSep     separator between alternative entries on one side
         * @param expansion  whether a rule without a mapping separator expands to all entries
         * @param tokFactory tokenizer used to split an entry into tokens; when null,
         *                   entries are split on whitespace (see getSynList)
         * @throws System.ApplicationException if a rule has no mapping part, more than
         *         one mapping separator, or (in reduce mode) no entry to reduce to
         */
        public static void parseRules(List /*<String>*/ rules, SynonymMap map, string mappingSep,
                                      string synSep, bool expansion, TokenizerFactory tokFactory)
        {
            // Every mapping is registered without keeping the original token.
            bool includeOrig = false;

            for (var iter = rules.iterator(); iter.hasNext();)
            {
                // To use regexes, we need an expression that specifies an odd number of chars.
                // This can't really be done with string.split(), and since we need to
                // do unescaping at some point anyway, we wouldn't be saving any effort
                // by using regexes.

                string            rule    = (string)iter.next();
                List /*<String>*/ mapping = StrUtils.splitSmart(rule, mappingSep, false);
                int               parts   = mapping.size();

                List /*<List<String>>*/ source;
                List /*<List<String>>*/ target;

                // An empty rule (or one made only of separators) yields no parts at all;
                // report it as an invalid rule instead of failing inside mapping.get(0).
                if (parts == 0 || parts > 2)
                {
                    throw new System.ApplicationException("Invalid Synonym Rule:" + rule);
                }
                else if (parts == 2)
                {
                    source = getSynList((string)mapping.get(0), synSep, tokFactory);
                    target = getSynList((string)mapping.get(1), synSep, tokFactory);
                }
                else
                {
                    source = getSynList((string)mapping.get(0), synSep, tokFactory);
                    if (expansion)
                    {
                        // expand to all arguments
                        target = source;
                    }
                    else
                    {
                        // reduce to first argument; there must be one to reduce to
                        if (source.size() == 0)
                        {
                            throw new System.ApplicationException("Invalid Synonym Rule:" + rule);
                        }
                        target = new ArrayList/*<List<String>>*/ (1);
                        target.add(source.get(0));
                    }
                }

                for (var fromIter = source.iterator(); fromIter.hasNext();)
                {
                    List /*<String>*/ fromToks = (List)fromIter.next();
                    for (var toIter = target.iterator(); toIter.hasNext();)
                    {
                        List /*<String>*/ toToks = (List)toIter.next();
                        // NOTE(review): the trailing 'true' presumably asks the map to merge
                        // with existing entries - confirm against SynonymMap.add.
                        map.add(fromToks,
                                SynonymMap.makeTokens(toToks),
                                includeOrig,
                                true
                                );
                    }
                }
            }
        }
示例#3
0
        /** Splits a backslash escaped string on the separator.
         * <p>
         * Current backslash escaping supported:
         * <br> \n \t \r \b \f are escaped the same as a Java String
         * <br> Other characters following a backslash are produced verbatim (\c => c)
         * <p>
         * Empty pieces (adjacent separators, or a separator at either end) are
         * not added to the result. A null or empty separator never matches, so
         * the whole input becomes a single piece.
         *
         * @param s  the string to split
         * @param separator the separator to split on
         * @param decode decode backslash escaping; when false the backslash and the
         *               character after it are both copied unchanged
         * @return the list of non-empty pieces, in order
         */
        public static List /*<String>*/ splitSmart(string s, string separator, bool decode)
        {
            ArrayList /*<String>*/ lst = new ArrayList/*<String>*/ (2);

            java.lang.StringBuilder sb = new java.lang.StringBuilder();
            int pos = 0, end = s.Length;

            // An empty separator "matches" at every position without consuming any
            // input, which would keep pos from ever advancing. Skip matching entirely.
            bool canSplit = !string.IsNullOrEmpty(separator);

            while (pos < end)
            {
                if (canSplit && java.lang.String.instancehelper_startsWith(s, separator, pos))
                {
                    if (sb.length() > 0)
                    {
                        lst.add(sb.toString());
                        sb = new java.lang.StringBuilder();
                    }
                    pos += separator.Length;
                    continue;
                }

                char ch = s[pos++];
                if (ch == '\\')
                {
                    if (!decode)
                    {
                        sb.append(ch);
                    }
                    if (pos >= end)
                    {
                        break;  // trailing backslash: nothing left to escape
                    }
                    // The escaped character is consumed here, so an escaped
                    // separator is never treated as a split point.
                    ch = s[pos++];
                    if (decode)
                    {
                        switch (ch)
                        {
                        case 'n': ch = '\n'; break;

                        case 't': ch = '\t'; break;

                        case 'r': ch = '\r'; break;

                        case 'b': ch = '\b'; break;

                        case 'f': ch = '\f'; break;
                        }
                    }
                }

                sb.append(ch);
            }

            if (sb.length() > 0)
            {
                lst.add(sb.toString());
            }

            return(lst);
        }
示例#4
0
        /**
         * Splits a backslash escaped string on whitespace.
         *
         * Runs of whitespace separate words and never produce empty words. A
         * backslash escapes the following character (so escaped whitespace stays
         * inside a word). With <code>decode</code> the sequences \n \t \r \b \f are
         * translated and the backslash is dropped; without it the backslash and
         * the following character are copied unchanged.
         *
         * @param s the string to split
         * @param decode decode backslash escaping
         * @return the list of non-empty words, in order
         */
        public static List /*<String>*/ splitWS(string s, bool decode)
        {
            ArrayList /*<String>*/  words   = new ArrayList/*<String>*/ (2);
            java.lang.StringBuilder current = new java.lang.StringBuilder();
            int length = s.Length;
            int i      = 0;

            while (i < length)
            {
                char c = s[i];
                i++;

                if (java.lang.Character.isWhitespace(c))
                {
                    // end of a word (if one was being collected)
                    if (current.length() != 0)
                    {
                        words.add(current.toString());
                        current = new java.lang.StringBuilder();
                    }
                    continue;
                }

                if (c == '\\')
                {
                    if (!decode)
                    {
                        current.append(c);
                    }
                    if (i >= length)
                    {
                        break;  // trailing backslash: nothing left to escape
                    }

                    c = s[i];
                    i++;

                    if (decode)
                    {
                        if (c == 'n')
                        {
                            c = '\n';
                        }
                        else if (c == 't')
                        {
                            c = '\t';
                        }
                        else if (c == 'r')
                        {
                            c = '\r';
                        }
                        else if (c == 'b')
                        {
                            c = '\b';
                        }
                        else if (c == 'f')
                        {
                            c = '\f';
                        }
                    }
                }

                current.append(c);
            }

            if (current.length() != 0)
            {
                words.add(current.toString());
            }

            return words;
        }
示例#5
0
        /**
         * Builds a list of tokens from a compact test description.
         *
         * Space-separated groups each describe one position:
         *   a b c       => [a, b, c]
         *   a/b         => a and b share a position (b gets positionIncrement=0)
         *   a,3/b/c     => a has positionIncrement=3; b and c get positionIncrement=0
         *   a,1,10,11   => "a" with positionIncrement=1, startOffset=10, endOffset=11
         *
         * Defaults for the first token of a group: positionIncrement=1,
         * startOffset=0, endOffset=startOffset + term length. Stacked tokens
         * (after a '/') always get offsets 0..0.
         */
        public List /*<Token>*/ tokens(string str)
        {
            List /*<Token>*/ produced = new ArrayList/*<Token>*/ ();

            foreach (string group in str.Split(' '))
            {
                string[] stacked = group.Split('/');
                string[] fields  = stacked[0].Split(',');
                string   term    = fields[0];

                int posInc = fields.Length > 1 ? java.lang.Integer.parseInt(fields[1]) : 1;
                int start  = fields.Length > 2 ? java.lang.Integer.parseInt(fields[2]) : 0;
                int end    = fields.Length > 3 ? java.lang.Integer.parseInt(fields[3]) : start + term.Length;

                Token head = new Token(term, start, end, "TEST");
                head.setPositionIncrement(posInc);
                produced.add(head);

                // everything after the first '/' is stacked on the same position
                for (int k = 1; k < stacked.Length; k++)
                {
                    Token extra = new Token(stacked[k], 0, 0, "TEST");
                    extra.setPositionIncrement(0);
                    produced.add(extra);
                }
            }
            return produced;
        }
示例#6
0
        /**
         * <p>
         * This method is converting the independent LinkedHashMaps containing various
         * (silo'ed) suggestions for each mis-spelled word into individual
         * "holistic query corrections", aka. "Spell Check Possibility"
         * </p>
         * <p>
         * Rank here is the sum of each selected term's position in its respective
         * list of possibilities.
         * </p>
         * <p>
         * Each call picks, for every word i, the correction at index
         * <code>correctionIndex[i]</code>, then advances <code>correctionIndex</code>
         * like an odometer whose last word is the fastest-moving digit. Once
         * all combinations have been produced <code>done</code> is set.
         * </p>
         *
         * @return the next combination of corrections together with its rank
         * @throws NoSuchElementException if the iteration is already done
         */
        private RankedSpellPossibility internalNext()
        {
            if (done)
            {
                throw new NoSuchElementException();
            }

            List /*<SpellCheckCorrection>*/ possibleCorrection = new ArrayList/*<SpellCheckCorrection>*/ ();
            int rank = 0;

            for (int i = 0; i < correctionIndex.Length; i++)
            {
                // Read the current selection for word i before any index is advanced.
                List /*<SpellCheckCorrection>*/ singleWordPossibilities = (List)possibilityList.get(i);
                SpellCheckCorrection            singleWordPossibility   = (SpellCheckCorrection)singleWordPossibilities.get(correctionIndex[i]);
                rank += correctionIndex[i];

                // Indices are only advanced while visiting the last word, i.e. after
                // the selections of all earlier words have already been read.
                if (i == correctionIndex.Length - 1)
                {
                    correctionIndex[i]++;
                    if (correctionIndex[i] == singleWordPossibilities.size())
                    {
                        // The last word wrapped around: reset it and carry to the left.
                        correctionIndex[i] = 0;
                        if (correctionIndex.Length == 1)
                        {
                            // Single word: wrapping means every possibility was returned.
                            done = true;
                        }
                        for (int ii = i - 1; ii >= 0; ii--)
                        {
                            correctionIndex[ii]++;
                            // Word 0 is never reset: its overflow is left in place so the
                            // check after the outer loop can detect the end of iteration.
                            if (correctionIndex[ii] >= ((List)possibilityList.get(ii)).size() && ii > 0)
                            {
                                correctionIndex[ii] = 0;
                            }
                            else
                            {
                                break;
                            }
                        }
                    }
                }
                possibleCorrection.add(singleWordPossibility);
            }

            // The first word's index ran past its list: all combinations are exhausted.
            if (correctionIndex[0] == ((List)possibilityList.get(0)).size())
            {
                done = true;
            }

            RankedSpellPossibility rsl = new RankedSpellPossibility();

            rsl.setCorrections(possibleCorrection);
            rsl.setRank(rank);
            return(rsl);
        }
示例#7
0
        /**
         * Converts a sequence of tokens into the list of their term texts,
         * preserving iteration order.
         */
        public List /*<String>*/ tok2str(java.lang.Iterable /*<Token>*/ tokLst)
        {
            ArrayList /*<String>*/ terms = new ArrayList/*<String>*/ ();
            var cursor = tokLst.iterator();

            while (cursor.hasNext())
            {
                Token  tok    = (Token)cursor.next();
                char[] buffer = tok.termBuffer();
                int    length = tok.termLength();
                terms.add(new string(buffer, 0, length));
            }
            return terms;
        }
示例#8
0
            /**
             * Creates a stream over one token per given string. Offsets are
             * assigned consecutively, leaving a gap of two between the end of
             * one token and the start of the next.
             */
            public IterTokenStream(params string[] text)
            {
                ArrayList /*<Token>*/ tokens = new ArrayList/*<Token>*/ (text.Length);
                int start = 0;

                for (int i = 0; i < text.Length; i++)
                {
                    string term = text[i];
                    int    end  = start + term.Length;
                    tokens.add(new Token(term, start, end));
                    start = end + 2;
                }
                this.toks = tokens.iterator();
            }
示例#9
0
        /**
         * Checks parseRules in reduce mode (expansion=false): every entry of a
         * comma separated rule must map to the first entry only.
         */
        public void testRead1waySynonymRules()
        {
            // "a,b":
            // (a)->[a]
            // (b)->[a]
            {
                List /*<String>*/ rules = new ArrayList/*<String>*/ ();
                rules.add("a,b");
                SynonymMap synMap = new SynonymMap(true);
                SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", false, null);

                Assert.AreEqual(2, synMap.submap.size());
                assertTokIncludes(synMap, "a", "a");
                assertTokIncludes(synMap, "b", "a");
            }

            // "a,b,c":
            // (a)->[a]
            // (b)->[a]
            // (c)->[a]
            {
                List /*<String>*/ rules = new ArrayList/*<String>*/ ();
                rules.add("a,b,c");
                SynonymMap synMap = new SynonymMap(true);
                SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", false, null);

                Assert.AreEqual(3, synMap.submap.size());
                assertTokIncludes(synMap, "a", "a");
                assertTokIncludes(synMap, "b", "a");
                assertTokIncludes(synMap, "c", "a");
            }

            // "a,b1 b2" (multi-token entry on the source side):
            // (a)->[a]
            // (b1)->(b2)->[a]
            {
                List /*<String>*/ rules = new ArrayList/*<String>*/ ();
                rules.add("a,b1 b2");
                SynonymMap synMap = new SynonymMap(true);
                SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", false, null);

                Assert.AreEqual(2, synMap.submap.size());
                assertTokIncludes(synMap, "a", "a");
                Assert.AreEqual(1, getSubSynonymMap(synMap, "b1").submap.size());
                assertTokIncludes(getSubSynonymMap(synMap, "b1"), "b2", "a");
            }

            // "a1 a2,b" (multi-token entry is the reduction target):
            // (a1)->(a2)->[a1][a2]
            // (b)->[a1][a2]
            {
                List /*<String>*/ rules = new ArrayList/*<String>*/ ();
                rules.add("a1 a2,b");
                SynonymMap synMap = new SynonymMap(true);
                SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", false, null);

                Assert.AreEqual(2, synMap.submap.size());
                Assert.AreEqual(1, getSubSynonymMap(synMap, "a1").submap.size());
                assertTokIncludes(getSubSynonymMap(synMap, "a1"), "a2", "a1");
                assertTokIncludes(getSubSynonymMap(synMap, "a1"), "a2", "a2");
                assertTokIncludes(synMap, "b", "a1");
                assertTokIncludes(synMap, "b", "a2");
            }
        }
示例#10
0
        /**
         * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
         * tokens from both lists that fall on the same absolute position end up stacked together.
         *
         * The absolute position of a token is the running sum of the position increments in its
         * own list. Tokens are emitted in order of absolute position; on a tie the tokens of
         * lst1 are emitted first. Every emitted token is a new Token whose position increment is
         * its absolute position minus the absolute position of the previously emitted token.
         *
         * Example:  [a b] merged with [c d] produces [a/c d/b]  ('/' denotes tokens in the same position)
         * Example:  [a,5 b,2] merged with [c d,4 e,4] produces [c d,4/a b,2 e,2]
         *           (x,n means x is emitted with posInc=n; a gets posInc=0 because it shares d's position)
         *
         * If either list is null the result simply contains the tokens of the other list
         * (the same Token instances, not copies); if both are null the result is empty.
         *
         * @param lst1 first token list, may be null
         * @param lst2 second token list, may be null
         * @return a new list holding the merged tokens
         */
        public static List /*<Token>*/ mergeTokens(List /*<Token>*/ lst1, List /*<Token>*/ lst2)
        {
            ArrayList /*<Token>*/ result = new ArrayList/*<Token>*/ ();

            // Nothing to merge when at least one side is missing: pass the other side through.
            if (lst1 == null || lst2 == null)
            {
                if (lst2 != null)
                {
                    result.addAll(lst2);
                }
                if (lst1 != null)
                {
                    result.addAll(lst1);
                }
                return(result);
            }

            // pos  = absolute position of the last token emitted
            // pos1 = absolute position of tok1, pos2 = absolute position of tok2
            int pos = 0;
            Iterator /*<Token>*/ iter1 = lst1.iterator();
            Iterator /*<Token>*/ iter2 = lst2.iterator();
            Token tok1 = (Token)(iter1.hasNext() ? iter1.next() : null);
            Token tok2 = (Token)(iter2.hasNext() ? iter2.next() : null);
            int   pos1 = tok1 != null?tok1.getPositionIncrement() : 0;

            int pos2 = tok2 != null?tok2.getPositionIncrement() : 0;

            while (tok1 != null || tok2 != null)
            {
                // Drain lst1 while its current token is not behind lst2's (ties go to lst1),
                // or unconditionally once lst2 is exhausted.
                while (tok1 != null && (pos1 <= pos2 || tok2 == null))
                {
                    Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
                    tok.setTermBuffer(tok1.termBuffer(), 0, tok1.termLength());
                    tok.setPositionIncrement(pos1 - pos);
                    result.add(tok);
                    pos   = pos1;
                    tok1  = (Token)(iter1.hasNext() ? iter1.next() : null);
                    pos1 += tok1 != null?tok1.getPositionIncrement() : 0;
                }
                // Same for lst2; pos2 == pos1 is only reachable here when lst1 has just
                // emitted everything it had on that position.
                while (tok2 != null && (pos2 <= pos1 || tok1 == null))
                {
                    Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
                    tok.setTermBuffer(tok2.termBuffer(), 0, tok2.termLength());
                    tok.setPositionIncrement(pos2 - pos);
                    result.add(tok);
                    pos   = pos2;
                    tok2  = (Token)(iter2.hasNext() ? iter2.next() : null);
                    pos2 += tok2 != null?tok2.getPositionIncrement() : 0;
                }
            }
            return(result);
        }
示例#11
0
        /**
         * Produces a List&lt;Token&gt; from a List&lt;String&gt;: one token of type
         * "SYNONYM" with offsets 0..0 per input string, in the same order.
         */
        public static List /*<Token>*/ makeTokens(List /*<String>*/ strings)
        {
            List /*<Token>*/ tokens = new ArrayList/*<Token>*/ (strings.size());
            var cursor = strings.iterator();

            while (cursor.hasNext())
            {
                string term    = (string)cursor.next();
                Token  synonym = new Token(0, 0, "SYNONYM");
                char[] chars   = term.ToCharArray();
                synonym.setTermBuffer(chars, 0, term.Length);
                tokens.add(synonym);
            }
            return tokens;
        }
示例#12
0
        /**
         * A rule with more than one mapping separator ("a=>b=>c") must be rejected.
         *
         * parseRules reports invalid rules with System.ApplicationException, which is
         * not a java.lang.RuntimeException, so both are accepted here; catching only
         * the Java type would let the expected failure escape the test.
         */
        public void testInvalidMappingRules()
        {
            SynonymMap        synMap = new SynonymMap(true);
            List /*<String>*/ rules  = new ArrayList/*<String>*/ (1);

            rules.add("a=>b=>c");
            try
            {
                SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
                Assert.Fail("RuntimeException must be thrown.");
            }
            catch (System.ApplicationException)
            {
                // expected: this is what parseRules throws for an invalid rule
            }
            catch (java.lang.RuntimeException)
            {
                // expected: kept so the test also passes if the Java exception type is used
            }
        }
示例#13
0
        /**
         * Splits a string on a separator character, except where the separator
         * appears inside a quoted section ('...' or "...").
         *
         * A backslash escapes the next character both inside and outside quoted
         * sections. A quote character directly preceded by a letter or digit does
         * not open a quoted section (examples: 50" TV, can't). Pieces are returned
         * exactly as they appear in the input (quotes and backslashes included);
         * an empty piece at the very end is not returned.
         */
        public static List /*<String>*/ splitSmart(string s, char separator)
        {
            const char None = (char)0;

            ArrayList /*<String>*/ pieces = new ArrayList/*<String>*/ (4);
            int  length       = s.Length;
            int  segmentStart = 0;
            char openQuote    = None;   // quote char of the section we are in, if any
            char current      = None;

            for (int i = 0; i < length;)
            {
                char previous = current;
                current = s[i];
                i++;

                if (current == '\\')
                {
                    // skip the escaped character
                    i++;
                    continue;
                }

                if (openQuote != None && current == openQuote)
                {
                    // closing quote
                    openQuote = None;
                    continue;
                }

                if (current == '\'' || current == '"')
                {
                    // Only a quote that does not directly follow a letter or digit
                    // starts a quoted section.
                    if (!java.lang.Character.isLetterOrDigit(previous))
                    {
                        openQuote = current;
                    }
                    continue;
                }

                if (current == separator && openQuote == None)
                {
                    // i already points past the separator
                    pieces.add(java.lang.String.instancehelper_substring(s, segmentStart, i - 1));
                    segmentStart = i;
                }
            }

            if (segmentStart < length)
            {
                pieces.add(java.lang.String.instancehelper_substring(s, segmentStart, length));
            }

            return pieces;
        }
示例#14
0
        /**
         * Splits a comma separated list of file names.
         * A comma that belongs to a file name must be escaped with a backslash '\'.
         *
         * @param fileNames the string containing file names, may be null
         * @return a list of file names with the backslash in front of each escaped
         *         comma removed; an empty list when fileNames is null
         */
        public static List /*<String>*/ splitFileNames(string fileNames)
        {
            if (fileNames == null)
            {
                return java.util.Collections.emptyList();
            }

            List /*<String>*/ names = new ArrayList/*<String>*/ ();

            // split on commas that are not preceded by a backslash
            var escapedNames = fileNames.split("(?<!\\\\),");
            foreach (string escaped in escapedNames)
            {
                // drop the backslash in front of each escaped comma
                string unescaped = escaped.replaceAll("\\\\(?=,)", "");
                names.add(unescaped);
            }

            return names;
        }
示例#15
0
        // Turns one side of a synonym rule into a list of token lists:
        //   a , b c , d e f => [[a],[b,c],[d,e,f]]
        // Entries are separated by 'separator'; each entry is split into tokens on
        // whitespace (with backslash decoding) when no tokenizer factory is given,
        // otherwise by the tokenizer that factory provides.
        private static List /*<List<String>>*/ getSynList(string str, string separator, TokenizerFactory tokFactory)
        {
            List /*<List<String>>*/ result  = new ArrayList/*<List<String>>*/ ();
            List /*<String>*/       entries = StrUtils.splitSmart(str, separator, false);
            var cursor = entries.iterator();

            while (cursor.hasNext())
            {
                string            entry = (string)cursor.next();
                List /*<String>*/ entryTokens;

                if (tokFactory == null)
                {
                    entryTokens = StrUtils.splitWS(entry, true);
                }
                else
                {
                    entryTokens = splitByTokenizer(entry, tokFactory);
                }
                result.add(entryTokens);
            }
            return result;
        }
示例#16
0
        /**
         * <p>
         * Builds every combination of per-token corrections up front, sorts the
         * combinations, and prepares an iterator over the sorted result.
         * </p>
         * <p>
         * We assume here that the passed-in inner LinkedHashMaps are already sorted
         * in order of "Best Possible Correction".
         * </p>
         * <p>
         * If there are no tokens, or any token has no suggestion at all, no
         * combination is produced.
         * </p>
         *
         * @param suggestions maps each token to its suggestions (correction text
         *                    mapped to number of occurrences)
         */
        public PossibilityIterator(Map /*<Token, LinkedHashMap<String, Integer>>*/ suggestions)
        {
            for (var iter = suggestions.entrySet().iterator(); iter.hasNext();)
            {
                Map.Entry /*<Token, LinkedHashMap<String, Integer>>*/ entry = (Map.Entry)iter.next();
                Token token = (Token)entry.getKey();
                List /*<SpellCheckCorrection>*/ possibleCorrections = new ArrayList/*<SpellCheckCorrection>*/ ();
                for (var iter1 = ((LinkedHashMap)entry.getValue()).entrySet().iterator(); iter1.hasNext();)
                {
                    Map.Entry /*<String, Integer>*/ entry1     = (Map.Entry)iter1.next();
                    SpellCheckCorrection            correction = new SpellCheckCorrection();
                    correction.setOriginal(token);
                    correction.setCorrection((string)entry1.getKey());

                    // The count may arrive as a java.lang.Integer (a reference type that a
                    // plain (int) unbox rejects with InvalidCastException) or as a boxed
                    // System.Int32; accept both.
                    object rawCount    = entry1.getValue();
                    int    occurrences = rawCount is java.lang.Number
                                         ? ((java.lang.Number)rawCount).intValue()
                                         : (int)rawCount;
                    correction.setNumberOfOccurences(occurrences);
                    possibleCorrections.add(correction);
                }
                possibilityList.add(possibleCorrections);
            }

            int wrapSize = possibilityList.size();

            if (wrapSize == 0)
            {
                done = true;
            }
            else
            {
                // one cursor per token, all starting at the first (best) correction
                correctionIndex = new int[wrapSize];
                for (int i = 0; i < wrapSize; i++)
                {
                    int suggestSize = ((List)possibilityList.get(i)).size();
                    if (suggestSize == 0)
                    {
                        // a token without any suggestion makes every combination impossible
                        done = true;
                        break;
                    }
                    correctionIndex[i] = 0;
                }
            }

            // Enumerate all combinations eagerly so they can be sorted before iteration.
            while (internalHasNext())
            {
                rankedPossibilityList.add(internalNext());
            }
            Collections.sort(rankedPossibilityList);
            rankedPossibilityIterator = rankedPossibilityList.iterator();
        }
示例#17
0
        /**
         * Converts the original query string to a collection of Lucene Tokens.
         *
         * Every match of QUERY_REGEX other than the literal words "AND" and "OR" is
         * analyzed; each token the analyzer produces is copied into a new Token
         * whose offsets are those of the regex match within the original string.
         *
         * @param original the original query string
         * @return a Collection of Lucene Tokens; empty when original is null
         */
        public override Collection /*<Token>*/ convert(string original)
        {
            if (original == null) // this can happen with q.alt = and no query
            {
                return Collections.emptyList();
            }

            Collection /*<Token>*/ tokens = new ArrayList/*<Token>*/ ();
            //TODO: Extract the words using a simple regex, but not query stuff, and then analyze them to produce the token stream
            Matcher words = QUERY_REGEX.matcher(original);

            while (words.find())
            {
                string word = words.group(0);
                if (word.Equals("AND") || word.Equals("OR"))
                {
                    // boolean operators are not spell-checked
                    continue;
                }

                try
                {
                    TokenStream stream = analyzer.reusableTokenStream("", new StringReader(word));
                    // TODO: support custom attributes
                    TermAttribute              term     = (TermAttribute)stream.addAttribute(typeof(TermAttribute));
                    FlagsAttribute             flags    = (FlagsAttribute)stream.addAttribute(typeof(FlagsAttribute));
                    TypeAttribute              type     = (TypeAttribute)stream.addAttribute(typeof(TypeAttribute));
                    PayloadAttribute           payload  = (PayloadAttribute)stream.addAttribute(typeof(PayloadAttribute));
                    PositionIncrementAttribute posIncr  = (PositionIncrementAttribute)stream.addAttribute(typeof(PositionIncrementAttribute));
                    stream.reset();

                    while (stream.incrementToken())
                    {
                        Token copy = new Token();
                        copy.setTermBuffer(term.termBuffer(), 0, term.termLength());
                        copy.setStartOffset(words.start());
                        copy.setEndOffset(words.end());
                        copy.setFlags(flags.getFlags());
                        copy.setType(type.type());
                        copy.setPayload(payload.getPayload());
                        copy.setPositionIncrement(posIncr.getPositionIncrement());
                        tokens.add(copy);
                    }
                }
                catch (IOException)
                {
                    // ignored: tokens collected so far are kept and the next word is tried
                }
            }
            return tokens;
        }
示例#18
0
        //------------------------------------------------------------------------
        // These may be useful beyond test cases...
        //------------------------------------------------------------------------

        // Drains the stream and returns every token it yields, in order.
        static List /*<Token>*/ getTokens(TokenStream tstream)
        {
            List /*<Token>*/ collected = new ArrayList/*<Token>*/ ();

            // TokenStream.next() is flagged obsolete (warning 612); silence it for the loop.
#pragma warning disable 612
            for (Token tok = tstream.next(); tok != null; tok = tstream.next())
            {
                collected.add(tok);
            }
#pragma warning restore 612
            return collected;
        }
示例#19
0
        /**
         * Loads the synonym rules named by the "synonyms" argument and builds the
         * synonym map from them. Does nothing when the argument is absent.
         *
         * The argument is used as a single resource name when a file of that name
         * exists; otherwise it is treated as a comma separated list of resource
         * names whose lines are concatenated. The "ignoreCase" (default false) and
         * "expand" (default true) arguments control map creation and rule parsing.
         * No tokenizer factory is passed to parseRules.
         */
        public void inform(ResourceLoader loader)
        {
            string synonyms   = (string)args.get("synonyms");
            bool   ignoreCase = getBoolean("ignoreCase", false);
            bool   expand     = getBoolean("expand", true);

            if (synonyms == null)
            {
                return;
            }

            List /*<String>*/ ruleLines = null;
            try
            {
                if (new File(synonyms).exists())
                {
                    ruleLines = loader.getLines(synonyms);
                }
                else
                {
                    List /*<String>*/ names = StrUtils.splitFileNames(synonyms);
                    ruleLines = new ArrayList/*<String>*/ ();

                    var cursor = names.iterator();
                    while (cursor.hasNext())
                    {
                        string name = (string)cursor.next();
                        ruleLines.addAll(loader.getLines(name.Trim()));
                    }
                }
            }
            catch (IOException e)
            {
                throw new System.ApplicationException("Unexpected exception", e);
            }

            synMap = new SynonymMap(ignoreCase);
            parseRules(ruleLines, synMap, "=>", ",", expand, null);
        }
示例#20
0
        /**
         * Runs the tokens described by <code>input</code> (see tokens()) through a
         * SynonymFilter backed by <code>dict</code> and returns clones of all tokens
         * the filter produces. The <code>includeOrig</code> parameter is not used.
         */
        public List /*<Token>*/ getTokList(SynonymMap dict, string input, bool includeOrig)
        {
            ArrayList /*<Token>*/ output = new ArrayList/*<Token>*/ ();
            List          parsed = tokens(input);
            TokenStream   source = new IteratorTokenStream(parsed.iterator());
            SynonymFilter filter = new SynonymFilter(source, dict);

            // One token instance is handed to every call to exercise token reuse,
            // which is why each result is cloned before it is stored.
            Token reusable = new Token();

            // next(Token) is flagged obsolete (warning 612); silence it for the loop.
#pragma warning disable 612
            for (Token produced = filter.next(reusable); produced != null; produced = filter.next(reusable))
            {
                output.add((Token)produced.clone());
            }
#pragma warning restore 612
            return output;
        }
示例#21
0
        /**
         * Reads a resource line by line using the given charset.
         *
         * Lines that start with '#' (checked before trimming) are skipped as
         * comments; the remaining lines are trimmed and skipped when blank.
         * The reader is closed in all cases.
         *
         * @return the remaining lines, trimmed, in file order
         */
        private List /*<String>*/ getLines(string resource, Charset charset)
        {
            ArrayList /*<String>*/ collected = new ArrayList/*<String>*/ ();
            BufferedReader         reader    = null;

            try
            {
                reader = new BufferedReader(new InputStreamReader(openResource(resource), charset));

                string raw;
                while ((raw = reader.readLine()) != null)
                {
                    // skip comments
                    if (raw.StartsWith("#"))
                    {
                        continue;
                    }

                    // keep everything that is not blank
                    string trimmed = raw.Trim();
                    if (trimmed.Length != 0)
                    {
                        collected.add(trimmed);
                    }
                }
            }
            finally
            {
                if (reader != null)
                {
                    reader.close();
                }
            }
            return collected;
        }
示例#22
0
        /*
         * Need to worry about multiple scenarios:
         *  - need to go for the longest match
         *    a b => foo      #shouldn't match if "a b" is followed by "c d"
         *    a b c d => bar
         *  - need to backtrack - retry matches for tokens already read
         *     a b c d => foo
         *       b c => bar
         *     If the input stream is "a b c x", one will consume "a b c d"
         *     trying to match the first rule... all but "a" should be
         *     pushed back so a match may be made on "b c".
         *  - don't try and match generated tokens (thus need separate queue)
         *    matching is not recursive.
         *  - handle optional generation of original tokens in all these cases,
         *    merging token streams to preserve token positions.
         *  - preserve original positionIncrement of first matched token
         */

#pragma warning disable 672
        /// <summary>
        /// Returns the next token of the synonym-expanded stream, or null once
        /// <c>nextTok</c> reports that no more input is available.
        /// </summary>
        /// <remarks>
        /// Tokens produced by an earlier match are handed out first and are never
        /// matched again. Otherwise one token is read; if it does not start a rule
        /// (or <c>match</c> finds nothing) it is returned unchanged. On a match, one
        /// new token per synonym is built spanning the offsets of the matched input,
        /// and, when the rule asks for it, the original tokens are interleaved with
        /// position increments rewritten so that absolute positions are preserved.
        /// </remarks>
        /// <param name="target">Token instance passed through to <c>nextTok</c>.</param>
        public override Token next(Token target)
        {
            for (;;)
            {
                // Drain previously generated tokens before reading more input; they
                // are returned as-is so that synonym output is never re-matched.
                bool havePending = replacement != null && replacement.hasNext();
                if (havePending)
                {
                    return (Token)replacement.next();
                }

                // Fast path: most tokens do not begin any synonym rule.
                Token head = nextTok(target);
                if (head == null)
                {
                    return null;
                }

                SynonymMap hit = null;
                if (map.submap != null)
                {
                    hit = (SynonymMap)map.submap.get(head.termBuffer(), 0, head.termLength());
                }
                if (hit == null)
                {
                    return head;
                }

                // The head token starts at least one rule; let match() look for the
                // longest one, recording the extra consumed tokens in `matched`.
                matched = new LinkedList/*<Token>*/ ();
                hit     = match(hit);
                if (hit == null)
                {
                    // Nothing matched after all: hand back the token we read.
                    return head;
                }

                // Room for every synonym plus every original token (head included).
                ArrayList /*<Token>*/ merged = new ArrayList/*<Token>*/ (hit.synonyms.Length + matched.size() + 1);

                // Last input token covered by the match; its end offset closes the
                // span assigned to every generated synonym token.
                Token tail = head;
                if (!matched.isEmpty())
                {
                    tail = (Token)matched.getLast();
                }

                Token nextOrig = hit.includeOrig() ? head : null; // next original token to emit, if any
                int   origAt   = head.getPositionIncrement();      // position of nextOrig in the input stream
                int   synAt    = 0;                                // position of the current synonym token
                int   emitted  = 0;                                // position reached in the merged output

                for (int idx = 0; idx < hit.synonyms.Length; idx++)
                {
                    Token model  = hit.synonyms[idx];
                    Token synTok = new Token(head.startOffset(), tail.endOffset(), head.type());
                    synTok.setTermBuffer(model.termBuffer(), 0, model.termLength());

                    // The first synonym is pinned to the position of the first
                    // original token; later ones advance by their own increment.
                    int step = model.getPositionIncrement();
                    synAt = (idx == 0) ? origAt : synAt + step;

                    // Emit every original token that sits at or before this synonym,
                    // rewriting its increment relative to what was already emitted.
                    while (nextOrig != null && origAt <= synAt)
                    {
                        nextOrig.setPositionIncrement(origAt - emitted);
                        merged.add(nextOrig);
                        emitted += nextOrig.getPositionIncrement();

                        if (matched.isEmpty())
                        {
                            nextOrig = null;
                        }
                        else
                        {
                            nextOrig = (Token)matched.removeFirst();
                            if (nextOrig != null)
                            {
                                origAt += nextOrig.getPositionIncrement();
                            }
                        }
                    }

                    synTok.setPositionIncrement(synAt - emitted);
                    merged.add(synTok);
                    emitted += synTok.getPositionIncrement();
                }

                // Original tokens positioned after the last synonym still go out.
                while (nextOrig != null)
                {
                    nextOrig.setPositionIncrement(origAt - emitted);
                    merged.add(nextOrig);
                    emitted += nextOrig.getPositionIncrement();

                    if (matched.isEmpty())
                    {
                        nextOrig = null;
                    }
                    else
                    {
                        nextOrig = (Token)matched.removeFirst();
                        if (nextOrig != null)
                        {
                            origAt += nextOrig.getPositionIncrement();
                        }
                    }
                }

                // Open question kept from the original: when a longer sequence is
                // replaced by a shorter one (a/0 b/5 => foo/0), should the gap be
                // re-created on the next buffered token?

                replacement = merged.iterator();
                // Loop again instead of returning directly: the merged list may be
                // empty, in which case more input has to be read and matched.
            }
        }
示例#23
0
        /// <summary>
        /// Checks the SynonymMap tree that SynonymFilterFactory.parseRules builds from
        /// explicit "source=>target" mapping rules, using "=>" as the mapping
        /// separator, "," as the synonym separator and no tokenizer factory.
        /// </summary>
        public void testReadMappingRules()
        {
            const string MapSep = "=>";
            const string SynSep = ",";

            List /*<String>*/ ruleSet = new ArrayList/*<String>*/ ();
            SynonymMap        root;
            SynonymMap        underA;
            SynonymMap        underB;

            // One source, one target:
            //   (a)->[b]
            ruleSet.add("a=>b");
            root = new SynonymMap(true);
            SynonymFilterFactory.parseRules(ruleSet, root, MapSep, SynSep, true, null);
            Assert.AreEqual(1, root.submap.size());
            assertTokIncludes(root, "a", "b");

            // Two sources sharing one target:
            //   (a)->[c]
            //   (b)->[c]
            ruleSet.clear();
            ruleSet.add("a,b=>c");
            root = new SynonymMap(true);
            SynonymFilterFactory.parseRules(ruleSet, root, MapSep, SynSep, true, null);
            Assert.AreEqual(2, root.submap.size());
            assertTokIncludes(root, "a", "c");
            assertTokIncludes(root, "b", "c");

            // One source, two targets:
            //   (a)->[b][c]
            ruleSet.clear();
            ruleSet.add("a=>b,c");
            root = new SynonymMap(true);
            SynonymFilterFactory.parseRules(ruleSet, root, MapSep, SynSep, true, null);
            Assert.AreEqual(1, root.submap.size());
            assertTokIncludes(root, "a", "b");
            assertTokIncludes(root, "a", "c");

            // A single-word rule and a two-word rule with the same first word:
            //   (a)->(b)->[a2]
            //        [a1]
            ruleSet.clear();
            ruleSet.add("a=>a1");
            ruleSet.add("a b=>a2");
            root = new SynonymMap(true);
            SynonymFilterFactory.parseRules(ruleSet, root, MapSep, SynSep, true, null);
            Assert.AreEqual(1, root.submap.size());
            assertTokIncludes(root, "a", "a1");
            underA = getSubSynonymMap(root, "a");
            Assert.AreEqual(1, underA.submap.size());
            assertTokIncludes(underA, "b", "a2");

            // Two different continuations below the same first word:
            //   (a)->(b)->[a2]
            //        (c)->[a3]
            //        [a1]
            ruleSet.clear();
            ruleSet.add("a=>a1");
            ruleSet.add("a b=>a2");
            ruleSet.add("a c=>a3");
            root = new SynonymMap(true);
            SynonymFilterFactory.parseRules(ruleSet, root, MapSep, SynSep, true, null);
            Assert.AreEqual(1, root.submap.size());
            assertTokIncludes(root, "a", "a1");
            underA = getSubSynonymMap(root, "a");
            Assert.AreEqual(2, underA.submap.size());
            assertTokIncludes(underA, "b", "a2");
            assertTokIncludes(underA, "c", "a3");

            // Two independent first words, each with its own continuation:
            //   (a)->(b)->[a2]
            //        [a1]
            //   (b)->(c)->[b2]
            //        [b1]
            ruleSet.clear();
            ruleSet.add("a=>a1");
            ruleSet.add("a b=>a2");
            ruleSet.add("b=>b1");
            ruleSet.add("b c=>b2");
            root = new SynonymMap(true);
            SynonymFilterFactory.parseRules(ruleSet, root, MapSep, SynSep, true, null);
            Assert.AreEqual(2, root.submap.size());
            assertTokIncludes(root, "a", "a1");
            underA = getSubSynonymMap(root, "a");
            Assert.AreEqual(1, underA.submap.size());
            assertTokIncludes(underA, "b", "a2");
            assertTokIncludes(root, "b", "b1");
            underB = getSubSynonymMap(root, "b");
            Assert.AreEqual(1, underB.submap.size());
            assertTokIncludes(underB, "c", "b2");
        }