/// <summary>
/// Runs <c>BertTokenizer.Fulltokenize</c> on <paramref name="input"/> using the
/// fixture's vocabulary table and compares the produced tokens with
/// <paramref name="expected"/> via <c>ArrayEqual</c>.
/// </summary>
/// <param name="input">Raw text handed to the tokenizer.</param>
/// <param name="expected">Token sequence the tokenizer is expected to return.</param>
public void FullTokenizeTest(string input, string[] expected)
{
    // Act
    string[] actualTokens = BertTokenizer.Fulltokenize(input, vocabularyTable);

    // Assert
    ArrayEqual(expected, actualTokens);
}
/// <summary>
/// Tokenizes <paramref name="query"/> and <paramref name="content"/>, lays them out as
/// <c>[CLS] query [SEP] content [SEP]</c>, and writes the resulting vocabulary ids,
/// input mask and segment ids into <c>inputs0</c>, <c>inputs1</c> and <c>inputs2</c>.
/// </summary>
/// <remarks>
/// Query tokens are capped at <c>MAX_QUERY_LENTH</c>. Content tokens are capped so that
/// the whole sequence (including the three special tokens) does not exceed
/// <c>MAX_SEQ_LENTH</c>; without the cap, long content would make the sequence longer
/// than <c>MAX_SEQ_LENTH</c> and overrun the input buffers
/// (assuming they are sized to <c>MAX_SEQ_LENTH</c> - TODO confirm at their declaration).
/// </remarks>
/// <param name="query">The question text.</param>
/// <param name="content">The passage text; split into words on BERT whitespace.</param>
/// <returns>
/// A <c>ContentData</c> holding the split content words, the token-to-word index
/// mapping and the original content string.
/// </returns>
private ContentData PreProcess(string query, string content)
{
    // Materialized once: the count is needed to compute the content budget,
    // and the tokens are enumerated again when building the sequence.
    var queryTokens = BertTokenizer.Fulltokenize(query, vocabularyTable)
        .Take(MAX_QUERY_LENTH)
        .ToList();

    var contentWords = content.Split((char c) => c.IsBertWhiteSpace(), StringSplitOptions.RemoveEmptyEntries);

    // For every content sub-token, remember which word of contentWords produced it.
    var contentTokenIdxToWordIdxMapping = new List<int>();
    var contentTokens = new List<string>();
    for (int i = 0; i < contentWords.Length; i++)
    {
        var wordTokens = BertTokenizer.Fulltokenize(contentWords[i], vocabularyTable);
        foreach (var subToken in wordTokens)
        {
            contentTokenIdxToWordIdxMapping.Add(i);
            contentTokens.Add(subToken);
        }
    }

    // -3 accounts for [CLS], [SEP] and [SEP].
    int maxContentLen = MAX_SEQ_LENTH - queryTokens.Count - 3;
    if (maxContentLen < 0)
    {
        maxContentLen = 0;
    }
    int contentLen = contentTokens.Count < maxContentLen ? contentTokens.Count : maxContentLen;

    var tokens = new List<string>(MAX_SEQ_LENTH);
    var segmentIds = new List<int>(MAX_SEQ_LENTH);

    // Maps a position in the final token sequence to an index into contentWords.
    var tokenIdxToWordIdxMapping = new Dictionary<int, int>();

    // Start of sequence.
    tokens.Add("[CLS]");
    segmentIds.Add(0);

    // Query segment.
    foreach (string t in queryTokens)
    {
        tokens.Add(t);
        segmentIds.Add(0);
    }

    // Separator between query and content.
    tokens.Add("[SEP]");
    segmentIds.Add(0);

    // Content segment, truncated to the remaining budget.
    for (int i = 0; i < contentLen; i++)
    {
        tokens.Add(contentTokens[i]);
        segmentIds.Add(1);
        // The key is tokens.Count *after* the add, i.e. the token's index plus one.
        // Kept as in the original; NOTE(review): consumers presumably compensate
        // for this offset - verify before changing.
        tokenIdxToWordIdxMapping[tokens.Count] = contentTokenIdxToWordIdxMapping[i];
    }

    // End of sequence.
    tokens.Add("[SEP]");
    segmentIds.Add(1);

    // Presumably clears leftovers from a previous call so unused slots act as padding - TODO confirm.
    ResetArray(inputs0);
    ResetArray(inputs1);
    ResetArray(inputs2);

    for (int i = 0; i < tokens.Count; i++)
    {
        // Input IDs
        inputs0[i] = vocabularyTable[tokens[i]];
        // Input mask: 1 marks a slot that holds a real token.
        inputs1[i] = 1;
        // Segment IDs
        inputs2[i] = segmentIds[i];
    }

    return new ContentData()
    {
        contentWords = contentWords,
        tokenIdxToWordIdxMapping = tokenIdxToWordIdxMapping,
        originalContent = content,
    };
}