Example #1
        public void WordPieceTokenizeWithCustomVocabTest(string input, string[] expected)
        {
            var vocabText = @"[UNK]
[CLS]
[SEP]
want
##want
##ed
wa
un
runn
##ing";
            var table = Bert.LoadVocabularies(vocabText);

            Assert.True(table.ContainsKey("[UNK]"));
            Assert.True(table.ContainsKey("want"));
            Assert.True(table.ContainsKey("##want"));
            ArrayEqual(expected, BertTokenizer.WordPieceTokenize(input, table));
        }
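The (string input, string[] expected) parameters suggest a parameterized test whose data attributes were dropped from this listing, and ArrayEqual is a helper defined elsewhere. A minimal sketch of what the missing pieces might look like, assuming NUnit (as used by the Unity Test Framework); the [TestCase] values are the classic BERT WordPiece fixtures and are illustrative, not taken from this source:

    using NUnit.Framework;

    // Hypothetical test data; only WordPieceTokenize and the vocabulary above come from the source.
    [TestCase("unwanted running", new[] { "un", "##want", "##ed", "runn", "##ing" })]
    [TestCase("unwantedx running", new[] { "[UNK]", "runn", "##ing" })]
    public void WordPieceTokenizeWithCustomVocabTest(string input, string[] expected)
    {
        // ... body as in Example #1 ...
    }

    // A plausible ArrayEqual helper: element-wise comparison of the two arrays.
    private static void ArrayEqual<T>(T[] expected, T[] actual)
    {
        Assert.AreEqual(expected.Length, actual.Length);
        for (int i = 0; i < expected.Length; i++)
        {
            Assert.AreEqual(expected[i], actual[i]);
        }
    }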
Example #2
 public void WordPieceTokenizeTest(string input, string[] expected)
 {
     ArrayEqual(expected, BertTokenizer.WordPieceTokenize(input, vocabularyTable));
 }
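For context, WordPiece segmentation is greedy longest-match-first: repeatedly take the longest vocabulary entry that prefixes the remaining characters of the word, mark non-initial pieces with ##, and fall back to [UNK] when no segmentation exists. A self-contained sketch of that algorithm under those assumptions; it illustrates the technique, not the project's actual implementation:

    using System.Collections.Generic;

    static List<string> WordPieceSketch(
        string word, IReadOnlyDictionary<string, int> vocab,
        string unkToken = "[UNK]", int maxCharsPerWord = 200)
    {
        // Overly long words are mapped to [UNK] outright.
        if (word.Length > maxCharsPerWord) return new List<string> { unkToken };

        var output = new List<string>();
        int start = 0;
        while (start < word.Length)
        {
            // Greedily search for the longest piece present in the vocabulary.
            int end = word.Length;
            string piece = null;
            while (start < end)
            {
                string candidate = word.Substring(start, end - start);
                if (start > 0) candidate = "##" + candidate;  // non-initial pieces carry the ## prefix
                if (vocab.ContainsKey(candidate)) { piece = candidate; break; }
                end--;
            }
            // If no prefix of the remainder matched, the whole word is unknown.
            if (piece == null) return new List<string> { unkToken };
            output.Add(piece);
            start = end;
        }
        return output;
    }

With the vocabulary from Example #1, WordPieceSketch("unwanted", table) yields ["un", "##want", "##ed"], and a word with an unmatchable tail such as "unwantedx" collapses to ["[UNK]"].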
Example #3
 public void BasicTokenizeTest(string input, string[] expected)
 {
     string[] result = BertTokenizer.BasicTokenize(input);
     ArrayEqual(expected, result);
 }
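In the standard (uncased) BERT pipeline, basic tokenization lower-cases the text, splits on whitespace, and breaks punctuation out into separate tokens; the real method may additionally strip accents and isolate CJK characters. A rough sketch of that behavior under those assumptions:

    using System.Collections.Generic;
    using System.Text;

    static string[] BasicTokenizeSketch(string text)
    {
        var tokens = new List<string>();
        var current = new StringBuilder();

        void Flush()
        {
            if (current.Length > 0) { tokens.Add(current.ToString()); current.Clear(); }
        }

        foreach (char c in text.ToLowerInvariant())
        {
            if (char.IsWhiteSpace(c)) { Flush(); }
            else if (char.IsPunctuation(c) || char.IsSymbol(c))
            {
                Flush();
                tokens.Add(c.ToString());  // punctuation becomes its own token
            }
            else { current.Append(c); }
        }
        Flush();
        return tokens.ToArray();
    }

For example, BasicTokenizeSketch("Hello, world!") returns ["hello", ",", "world", "!"].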
Example #4
 public void FullTokenizeTest(string input, string[] expected)
 {
     string[] result = BertTokenizer.Fulltokenize(input, vocabularyTable);
     ArrayEqual(expected, result);
 }
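Full tokenization composes the two previous examples: basic-tokenize the raw text into words, then WordPiece-split each word against the vocabulary. A sketch of that wiring (the two BertTokenizer calls appear in the examples above; the composition itself and the Dictionary<string, int> table type are assumptions):

    using System.Collections.Generic;

    static string[] FulltokenizeSketch(string text, Dictionary<string, int> vocabularyTable)
    {
        var pieces = new List<string>();
        foreach (string word in BertTokenizer.BasicTokenize(text))
        {
            // Each whitespace/punctuation-level token is further split into sub-word pieces.
            pieces.AddRange(BertTokenizer.WordPieceTokenize(word, vocabularyTable));
        }
        return pieces.ToArray();
    }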
        private ContentData PreProcess(string query, string content)
        {
            var queryTokens = BertTokenizer.Fulltokenize(query, vocabularyTable)
                              .Take(MAX_QUERY_LENTH);

            // string.Split has no predicate overload in the BCL; this relies on a custom
            // extension method (paired with IsBertWhiteSpace) defined elsewhere in the project.
            var contentWords = content.Split((char c) => c.IsBertWhiteSpace(), StringSplitOptions.RemoveEmptyEntries);
            var contentTokenIdxToWordIdxMapping = new List<int>();
            var contentTokens = new List<string>();

            for (int i = 0; i < contentWords.Length; i++)
            {
                var wordTokens = BertTokenizer.Fulltokenize(contentWords[i], vocabularyTable);
                foreach (var subToken in wordTokens)
                {
                    contentTokenIdxToWordIdxMapping.Add(i);
                    contentTokens.Add(subToken);
                }
            }

            // -3 accounts for [CLS], [SEP] and [SEP]. Padding contentTokens up to the
            // remaining budget is currently disabled:
            // int maxContentLen = MAX_SEQ_LENTH - queryTokens.Count() - 3;
            // contentTokens.AddRange(new string[MAX_SEQ_LENTH - queryTokens.Count() - 3 - contentTokens.Count]);

            var tokens     = new List<string>(MAX_SEQ_LENTH);
            var segmentIds = new List<int>(MAX_SEQ_LENTH);

            // Map token index to original index (in feature.origTokens).
            var tokenIdxToWordIdxMapping = new Dictionary<int, int>();

            // Start of generating the `InputFeatures`.
            tokens.Add("[CLS]");
            segmentIds.Add(0);

            // For query input.
            foreach (string t in queryTokens)
            {
                tokens.Add(t);
                segmentIds.Add(0);
            }

            // For separation.
            tokens.Add("[SEP]");
            segmentIds.Add(0);

            // For text input.
            for (int i = 0; i < contentTokens.Count; i++)
            {
                tokens.Add(contentTokens[i]);
                segmentIds.Add(1);
                // Note: tokens.Count is the 1-based position of the token just added
                // (index + 1); the post-processing step presumably applies a matching
                // offset when mapping model outputs back to words.
                tokenIdxToWordIdxMapping[tokens.Count] = contentTokenIdxToWordIdxMapping[i];
            }

            // For ending mark.
            tokens.Add("[SEP]");
            segmentIds.Add(1);

            ResetArray(inputs0);
            ResetArray(inputs1);
            ResetArray(inputs2);
            for (int i = 0; i < tokens.Count; i++)
            {
                // Input IDs
                inputs0[i] = vocabularyTable[tokens[i]];
                // Input Mask
                inputs1[i] = 1;
                // Segment IDs
                inputs2[i] = segmentIds[i];
            }

            return new ContentData()
            {
                contentWords = contentWords,
                tokenIdxToWordIdxMapping = tokenIdxToWordIdxMapping,
                originalContent = content,
            };
        }
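PreProcess relies on several members defined outside this snippet (MAX_QUERY_LENTH, MAX_SEQ_LENTH, inputs0 through inputs2, ResetArray, ContentData). A hedged reconstruction of what they might look like, inferred purely from how they are used here; the constant values, array element types, and access modifiers are guesses:

    using System;
    using System.Collections.Generic;

    // Class members assumed by PreProcess (all values and types are guesses):
    private const int MAX_QUERY_LENTH = 64;   // max query tokens; name kept as in the snippet
    private const int MAX_SEQ_LENTH   = 384;  // max packed sequence length

    // Model inputs, one slot per sequence position:
    // token IDs, attention mask, and segment IDs respectively.
    private readonly int[] inputs0 = new int[MAX_SEQ_LENTH];
    private readonly int[] inputs1 = new int[MAX_SEQ_LENTH];
    private readonly int[] inputs2 = new int[MAX_SEQ_LENTH];

    // Zeroes an input buffer before it is refilled for the next query.
    private static void ResetArray(int[] array) => Array.Clear(array, 0, array.Length);

    private class ContentData
    {
        public string[] contentWords;
        public Dictionary<int, int> tokenIdxToWordIdxMapping;
        public string originalContent;
    }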