Example #1
        /// <summary>
        /// <para>Tokenize the sense's equiv, presented as hybrid text.</para>
        /// <para>During parsing, creates new word IDs as tokens come up.</para>
        /// </summary>
        public ReadOnlyCollection<EquivToken> Tokenize(HybridText txt)
        {
            List<EquivToken> res = new List<EquivToken>();
            int runIX            = -1;

            foreach (TextRun tr in txt.Runs)
            {
                ++runIX;
                if (tr is TextRunZho)
                {
                    int        idZho = wh.IdZho;
                    EquivToken eqt   = new EquivToken
                    {
                        TokenId     = idZho,
                        RunIx       = runIX,
                        StartInRun  = 0,
                        LengthInRun = 0,
                    };
                    res.Add(eqt);
                    continue;
                }
                string str = tr.GetPlainText();
                tokenizeRun(str, runIX, res);
            }
            return new ReadOnlyCollection<EquivToken>(res);
        }
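A minimal caller sketch for the tokenizer above; it assumes a tokenizer instance of the enclosing class and an already-built HybridText named txt (HybridText construction is not shown in these examples), and only illustrates how the returned EquivToken fields are meant to be read.

        // Hypothetical caller; 'tokenizer' and 'txt' are assumed to already exist.
        ReadOnlyCollection<EquivToken> tokens = tokenizer.Tokenize(txt);
        foreach (EquivToken eqt in tokens)
        {
            // Chinese runs come back as a single zero-length token carrying the shared wh.IdZho id;
            // plain-text runs carry each token's start offset and length within that run.
            Console.WriteLine("token {0}: run {1}, start {2}, length {3}",
                eqt.TokenId, eqt.RunIx, eqt.StartInRun, eqt.LengthInRun);
        }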
Example #2
        /// <summary>
        /// Serialize to binary stream.
        /// </summary>
        public void Serialize(BinWriter bw)
        {
            bw.WriteInt(EntryId);
            bw.WriteInt(SenseIx);
            int equivTokenCount = EquivTokens.Count;

            bw.WriteInt(equivTokenCount);
            for (int i = 0; i != equivTokenCount; ++i)
            {
                EquivToken eqt = EquivTokens[i];
                if (eqt.RunIx < byte.MinValue || eqt.RunIx > byte.MaxValue)
                {
                    throw new Exception("RangeIx value out of byte range: " + eqt.StartInRun.ToString());
                }
                if (eqt.StartInRun < short.MinValue || eqt.StartInRun > short.MaxValue)
                {
                    throw new Exception("StartInSense value out of short range: " + eqt.StartInRun.ToString());
                }
                if (eqt.LengthInRun < short.MinValue || eqt.LengthInRun > short.MaxValue)
                {
                    throw new Exception("LengthInSense value out of short range: " + eqt.LengthInRun.ToString());
                }
                byte  rangeIx       = (byte)eqt.RunIx;
                short startInSense  = (short)eqt.StartInRun;
                short lengthInSense = (short)eqt.LengthInRun;
                bw.WriteInt(eqt.TokenId);
                bw.WriteByte(rangeIx);
                bw.WriteShort(startInSense);
                bw.WriteShort(lengthInSense);
            }
        }
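A minimal round-trip sketch combining Serialize with the deserializing constructor from the next example. The MemoryStream-backed constructors of BinWriter and BinReader, and the existing 'sense' instance, are assumptions; only the Serialize call and the TokenizedSense(BinReader) constructor come from the examples themselves.

        // Hypothetical round trip; stream-based BinWriter/BinReader constructors are assumed,
        // and 'sense' is an existing TokenizedSense instance (assumed).
        using (MemoryStream ms = new MemoryStream())
        {
            BinWriter bw = new BinWriter(ms);   // assumed constructor
            sense.Serialize(bw);                // EntryId, SenseIx, token count, then one record per token

            ms.Position = 0;
            BinReader br = new BinReader(ms);   // assumed constructor
            TokenizedSense copy = new TokenizedSense(br);
            // copy.EquivTokens now mirrors sense.EquivTokens, with the narrowed fields widened back to int.
        }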
Example #3
        /// <summary>
        /// Ctor: read from binary stream.
        /// </summary>
        public TokenizedSense(BinReader br)
        {
            EntryId = br.ReadInt();
            SenseIx = br.ReadInt();
            int equivTokenCount = br.ReadInt();

            EquivTokens = new List<EquivToken>(equivTokenCount);
            for (int i = 0; i != equivTokenCount; ++i)
            {
                EquivToken eqt = new EquivToken();
                eqt.TokenId     = br.ReadInt();
                eqt.RunIx       = (int)br.ReadByte();
                eqt.StartInRun  = (int)br.ReadShort();
                eqt.LengthInRun = (int)br.ReadShort();
                EquivTokens.Add(eqt);
            }
        }
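Note how this constructor mirrors the serialized layout exactly: it reads back the byte and short fields that Serialize narrowed and widens them to int again, so the round trip is lossless whenever RunIx fits in a byte and StartInRun/LengthInRun fit in a short. Assuming WriteInt, WriteByte and WriteShort emit 4, 1 and 2 bytes respectively, each token then occupies 9 bytes in the stream.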
Example #4
        /// <summary>
        /// Tokenizes a single plain text run.
        /// </summary>
        /// <param name="str">Text of the plain text run.</param>
        /// <param name="runIx">Current run index (to be stored in result).</param>
        /// <param name="tokens">List of tokens to append to.</param>
        private void tokenizeRun(string str, int runIx, List<EquivToken> tokens)
        {
            // One flag for each character.
            // If -1, character is outside of any token (e.g., whitespace, punctuation)
            // Otherwise, number indicates token index, from 0 upwards
            int[] flags = new int[str.Length];
            // At first, we assume entire text is a single token
            for (int i = 0; i != flags.Length; ++i)
            {
                flags[i] = 0;
            }
            // Split by space and dash
            int pos;

            // Trim from start
            for (pos = 0; pos != flags.Length; ++pos)
            {
                char c = str[pos];
                if (c != ' ' && c != '-')
                {
                    break;
                }
                flags[pos] = -1;
            }
            // Move on; when encountering ' ' or '-' again, segment
            for (; pos != flags.Length; ++pos)
            {
                char c = str[pos];
                // Space or dash here: mark as non-token; increase token IX of rest of string
                if (c == ' ' || c == '-')
                {
                    flags[pos] = -1;
                    for (int i = pos + 1; i < flags.Length; ++i)
                    {
                        ++flags[i];
                    }
                }
            }
            // Trim punctuation from start and end of each token
            pos = 0;
            while (pos != flags.Length)
            {
                // Skip non-tokens
                if (flags[pos] == -1)
                {
                    ++pos; continue;
                }
                // Find end of token
                int tokenStart  = pos;
                int tokenLength = 0;
                for (int i = pos; i != flags.Length; ++i)
                {
                    // Same token as long as index is the same
                    if (flags[i] == flags[pos])
                    {
                        ++tokenLength;
                    }
                    else
                    {
                        break;
                    }
                }
                // We'll move on in input by length of this token
                pos += tokenLength;
                // Trim punctuation from this token
                trimPunct(str, flags, tokenStart, tokenLength);
            }
            // We're done splitting and trimming, now create an EquivToken for each token.
            pos = 0;
            while (pos != flags.Length)
            {
                if (flags[pos] == -1)
                {
                    ++pos; continue;
                }
                // Find end of token
                int tokenStart  = pos;
                int tokenLength = 0;
                for (int i = pos; i != flags.Length; ++i)
                {
                    // Same token as long as index is the same
                    if (flags[i] == flags[pos])
                    {
                        ++tokenLength;
                    }
                    else
                    {
                        break;
                    }
                }
                pos += tokenLength;
                // Text of this token, lower-cased, OR *num*
                string strToken = str.Substring(tokenStart, tokenLength);
                if (reNumbers.IsMatch(strToken))
                {
                    strToken = WordHolder.TokenNum;
                }
                else
                {
                    strToken = strToken.ToLowerInvariant();
                }
                int        tokenId = wh.GetTokenId(strToken);
                EquivToken eqt     = new EquivToken
                {
                    TokenId     = tokenId,
                    RunIx       = runIx,
                    StartInRun  = tokenStart,
                    LengthInRun = tokenLength,
                };
                tokens.Add(eqt);
            }
        }
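To make the splitting and trimming concrete, here is a hand-worked illustration (not taken from the source); it assumes that trimPunct, which is not shown here, clears punctuation characters at token edges, and that reNumbers matches purely numeric tokens.

        // Hypothetical walk-through of tokenizeRun("well-known, 3 dogs.", 0, tokens):
        //   "well"   -> StartInRun 0,  LengthInRun 4   (the dash splits the first two tokens)
        //   "known"  -> StartInRun 5,  LengthInRun 5   (trailing "," assumed trimmed by trimPunct)
        //   "3"      -> StartInRun 12, LengthInRun 1   (matches reNumbers, stored as WordHolder.TokenNum)
        //   "dogs"   -> StartInRun 14, LengthInRun 4   (trailing "." assumed trimmed by trimPunct)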