/// <summary>
/// Creates a new <see cref="StaticStringTrieGraph"/> and fills it from this graph.
/// </summary>
/// <param name="isCopyCharTypeData">Forwarded unchanged to <c>StaticStringTrieGraph.Create</c>; presumably selects whether the character type data is copied (TODO confirm against that method).</param>
/// <returns>The newly created static graph.</returns>
public StaticStringTrieGraph CreateStaticGraph(bool isCopyCharTypeData = true)
{
    StaticStringTrieGraph staticGraph = new StaticStringTrieGraph();
    staticGraph.Create(this, isCopyCharTypeData);
    return staticGraph;
}
/// <summary>
/// Creates the static Trie graph from a string Trie graph while holding <c>NodePool.Lock</c>.
/// </summary>
/// <param name="staticGraph">Static graph that receives the created boot node; its builder is cancelled when creation fails.</param>
/// <param name="graph">Source Trie graph whose <c>Boot</c> node is the starting point.</param>
internal void Create(StaticStringTrieGraph staticGraph, StringTrieGraph graph)
{
    cache = DictionaryCreator.CreateOnly<StringTrieGraph.Node, int>();
    StringTrieGraph.Node rootNode = graph.Boot;
    bool succeeded = false;
    Monitor.Enter(NodePool.Lock);
    try
    {
        staticGraph.setBoot(create(rootNode));
        createLink(rootNode);
        succeeded = true;
    }
    finally
    {
        try
        {
            // Roll back only when creation did not complete and there is a cache to walk.
            if (!succeeded && cache != null)
            {
                staticGraph.CancelBuilder();
                foreach (int cachedIndex in cache.Values)
                {
                    // The current node is part of the cached set, so it must not be released a second time below.
                    if (cachedIndex == nodeIndex)
                    {
                        nodeIndex = 0;
                    }
                    NodePool.Pool[cachedIndex >> ArrayPool.ArraySizeBit][cachedIndex & ArrayPool.ArraySizeAnd].CancelBuilder();
                }
                NodePool.FreeNoLock(cache.Values);
                // A current node that was not found in the cache is cancelled and freed on its own.
                if (nodeIndex != 0)
                {
                    NodePool.Pool[nodeIndex >> ArrayPool.ArraySizeBit][nodeIndex & ArrayPool.ArraySizeAnd].CancelBuilder();
                    NodePool.FreeNoLock(nodeIndex);
                }
            }
        }
        finally
        {
            // The pool lock is released on every path, including a failing rollback.
            Monitor.Exit(NodePool.Lock);
        }
    }
}
/// <summary>
/// Word segmentation searcher bound to a result pool.
/// </summary>
/// <param name="trieGraph">String Trie graph bound to the static node pool; <c>StaticStringTrieGraph.Null</c> is used when this is null.</param>
/// <param name="flags">Search options.</param>
protected StaticSearcher(StaticStringTrieGraph trieGraph, SearchFlags flags)
{
    StaticStringTrieGraph boundGraph = trieGraph;
    if (object.ReferenceEquals(boundGraph, null))
    {
        // No graph supplied: fall back to the shared placeholder instance.
        boundGraph = StaticStringTrieGraph.Null;
    }
    this.trieGraph = boundGraph;
    this.flags = flags;
    QueueWait.Set(0);
}
/// <summary>
/// Segments a text into words and adds every word to <c>removeResult</c>.
/// </summary>
/// <remarks>
/// The text is first formatted into <c>formatText</c>, then scanned in up to two passes:
/// a Trie graph pass (skipped when <c>charTypeData</c> is the default table) and a
/// character-class pass over Chinese / OtherLetter / Letter / Number / Keep runs.
/// </remarks>
/// <param name="text">Text to segment. NOTE(review): it is handed to <c>Simplified.FormatNotEmpty</c> and scanned with do/while loops, so it is presumably required to be non-empty - confirm against callers.</param>
private void getRemoveResult(string text)
{
    removeResult.Clear();
    formatLength = text.Length;
    // One extra character is allocated: the slot at index formatLength is overwritten below as a scan sentinel.
    formatText = AutoCSer.Extension.StringExtension.FastAllocateString(formatLength + 1);
    fixed (char* textFixed = formatText)
    {
        Simplified.FormatNotEmpty(text, textFixed, formatLength);
        matchs.Length = 0;
        char* start = textFixed, end = textFixed + formatLength;
        if (charTypeData != StringTrieGraph.DefaultCharTypeData.Byte)
        {
            // Pass 1: words found through the Trie graph.
            StaticStringTrieGraph trieGraph = searcher.trieGraph;
            int count, index, startIndex;
            char trieGraphHeadChar = trieGraph.AnyHeadChar;
            byte type, nextType;
            bool isMatchMap = false;
            do
            {
                if (((type = charTypeData[*start]) & StringTrieGraph.TrieGraphHeadFlag) == 0)
                {
                    // Sentinel for the inner scan, which only stops on a head-flagged character
                    // (presumably AnyHeadChar always carries TrieGraphHeadFlag - confirm).
                    *end = trieGraphHeadChar;
                    do
                    {
                        if ((type & ((byte)WordType.Chinese | (byte)WordType.TrieGraph)) == ((byte)WordType.Chinese | (byte)WordType.TrieGraph))
                        {
                            removeResult.Add(new SubString((int)(start - textFixed), 1, formatText));
                        }
                        if (((nextType = charTypeData[*++start]) & StringTrieGraph.TrieGraphHeadFlag) != 0)
                        {
                            if (start == end)
                            {
                                // Only the sentinel was hit: pass 1 is finished.
                                goto TRIEGRAPHEND;
                            }
                            if ((nextType & (byte)WordType.Chinese) != 0 || (type & nextType & ((byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) == 0)
                            {
                                goto TRIEGRAPH;
                            }
                        }
                        type = nextType;
                    }
                    while (true);
                }
            TRIEGRAPH:
                // Replace the sentinel with a space; presumably a space carries no TrieGraph flag, so the scan below stops at end - confirm.
                *end = ' ';
                char* segment = start, segmentEnd = (type & StringTrieGraph.TrieGraphEndFlag) == 0 ? start++ : ++start;
                while (((type = charTypeData[*start]) & (byte)WordType.TrieGraph) != 0)
                {
                    ++start;
                    if ((type & StringTrieGraph.TrieGraphEndFlag) != 0)
                    {
                        // Remember the position just past the last end-flagged character.
                        segmentEnd = start;
                    }
                }
                if ((int)(start - segment) == 1)
                {
                    // NOTE(review): type was last assigned from the character at start (the one after this
                    // one-character segment), not from the segment character itself - confirm this is intended.
                    if ((type & (byte)WordType.Chinese) != 0)
                    {
                        removeResult.Add(new SubString((int)(segment - textFixed), 1, formatText));
                    }
                }
                else
                {
                    if (segment != segmentEnd)
                    {
                        matchs.Length = 0;
                        trieGraph.LeftRightMatchs(segment, segmentEnd, ref matchs);
                        if ((count = matchs.Length) == 0)
                        {
                            // Nothing matched: treat the whole segment as loose characters.
                            segmentEnd = segment;
                            goto CHINESE;
                        }
                        if (!isMatchMap)
                        {
                            // Done at most once per call, before the first matchMap.Set.
                            checkMatchMap();
                            isMatchMap = true;
                        }
                        startIndex = (int)(segment - textFixed);
                        // Only the first matchs.Length entries of matchs.Array are consumed.
                        foreach (KeyValue<int, int> value in matchs.Array)
                        {
                            removeResult.Add(new SubString(index = value.Key + startIndex, value.Value, formatText));
                            matchMap.Set(index, value.Value);
                            if (--count == 0)
                            {
                                break;
                            }
                        }
                        index = (int)(segmentEnd - textFixed);
                        do
                        {
                            // Chinese characters for which matchMap reports 0 become single-character words.
                            if (matchMap.Get(startIndex) == 0 && (charTypeData[textFixed[startIndex]] & (byte)WordType.Chinese) != 0)
                            {
                                removeResult.Add(new SubString(startIndex, 1, formatText));
                            }
                        }
                        while (++startIndex != index);
                    }
                CHINESE:
                    // Remaining characters between segmentEnd and start: Chinese ones become single-character words.
                    while (segmentEnd != start)
                    {
                        if ((charTypeData[*segmentEnd] & (byte)WordType.Chinese) != 0)
                        {
                            removeResult.Add(new SubString((int)(segmentEnd - textFixed), 1, formatText));
                        }
                        ++segmentEnd;
                    }
                }
            }
            while (start != end);
        TRIEGRAPHEND:
            // Rewind for the second pass.
            start = textFixed;
        }
        // Pass 2: character-class runs.
        do
        {
            byte type = charTypeData[*start];
            if ((type &= ((byte)WordType.Chinese | (byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) == 0)
            {
                // Sentinel for the skip loop below; presumably '0' is classified as a Number so the loop stops at end - confirm.
                *end = '0';
                do
                {
                    type = charTypeData[*++start];
                    if ((type &= ((byte)WordType.Chinese | (byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) != 0)
                    {
                        if (start == end)
                        {
                            return;
                        }
                        goto OTHER;
                    }
                }
                while (true);
            }
        OTHER:
            *end = ' ';
            if ((type & (byte)WordType.Chinese) != 0)
            {
                do
                {
                    if ((type & (byte)WordType.TrieGraph) == 0)
                    {
                        removeResult.Add(new SubString((int)(start - textFixed), 1, formatText));
                    }
                }
                while (((type = charTypeData[*++start]) & (byte)WordType.Chinese) != 0);
            }
            else
            {
                char* segment = start;
                if ((type & (byte)WordType.OtherLetter) == 0)
                {
                    char* word = start;
                    for (byte newType = charTypeData[*++start]; (newType &= ((byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) != 0; newType = charTypeData[*++start])
                    {
                        if (type != newType)
                        {
                            // The class changed: emit the sub-word that just ended unless it was a Keep run.
                            if (type != (byte)WordType.Keep)
                            {
                                removeResult.Add(new SubString((int)(word - textFixed), (int)(start - word), formatText));
                            }
                            type = newType;
                            word = start;
                        }
                    }
                    // Fix: also emit the trailing sub-word after the last class change, exactly as
                    // getResult and get do; without it that word was never added to removeResult.
                    if (word != segment && type != (byte)WordType.Keep)
                    {
                        removeResult.Add(new SubString((int)(word - textFixed), (int)(start - word), formatText));
                    }
                }
                else
                {
                    while ((charTypeData[*++start] & (byte)WordType.OtherLetter) != 0)
                    {
                        ;
                    }
                }
                // The whole run is always emitted as one word.
                removeResult.Add(new SubString((int)(segment - textFixed), (int)(start - segment), formatText));
            }
        }
        while (start != end);
    }
}
/// <summary>
/// Segments a text into words and fills <c>result</c> from them.
/// </summary>
/// <remarks>
/// Words are first collected through <c>addWord</c> in up to two scanning passes (a Trie graph pass,
/// skipped when <c>charTypeData</c> is the default table, and a character-class pass), then copied
/// from <c>words</c> into <c>result</c>. When <c>SearchFlags.ResultIndexs</c> is set, the start index
/// of every occurrence is recorded per word and the index lists are sorted.
/// </remarks>
/// <param name="text">Text to segment. NOTE(review): it is handed to <c>Simplified.FormatNotEmpty</c> and scanned with do/while loops, so it is presumably required to be non-empty - confirm against callers.</param>
protected void getResult(string text)
{
    result.Clear();
    // One extra character is allocated: the slot at index foramtLength is overwritten below as a scan sentinel.
    formatText = AutoCSer.Extension.StringExtension.FastAllocateString((foramtLength = text.Length) + 1);
    fixed(char *textFixed = formatText)
    {
        Simplified.FormatNotEmpty(text, textFixed, foramtLength);
        words.Length = matchs.Length = 0;
        char *start = textFixed, end = textFixed + foramtLength;
        byte type, nextType, wordType;
        bool isMatchMap = false;
        if (charTypeData != StringTrieGraph.DefaultCharTypeData.Byte)
        {
            // Pass 1: words found through the Trie graph.
            StaticStringTrieGraph trieGraph = searcher.trieGraph;
            int count, index, startIndex;
            char trieGraphHeadChar = trieGraph.AnyHeadChar;
            do
            {
                if (((type = charTypeData[*start]) & StringTrieGraph.TrieGraphHeadFlag) == 0)
                {
                    // Sentinel for the inner scan, which only stops on a head-flagged character
                    // (presumably AnyHeadChar always carries TrieGraphHeadFlag - confirm).
                    *end = trieGraphHeadChar;
                    do
                    {
                        if ((type & ((byte)WordType.Chinese | (byte)WordType.TrieGraph)) == ((byte)WordType.Chinese | (byte)WordType.TrieGraph))
                        {
                            addWord((int)(start - textFixed), 1, WordType.Chinese);
                        }
                        if (((nextType = charTypeData[*++start]) & StringTrieGraph.TrieGraphHeadFlag) != 0)
                        {
                            if (start == end)
                            {
                                // Only the sentinel was hit: pass 1 is finished.
                                goto TRIEGRAPHEND;
                            }
                            if ((nextType & (byte)WordType.Chinese) != 0 || (type & nextType & ((byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) == 0)
                            {
                                goto TRIEGRAPH;
                            }
                        }
                        type = nextType;
                    }while (true);
                }
                TRIEGRAPH:
                // Replace the sentinel with a space; presumably a space carries no TrieGraph flag, so the scan below stops at end - confirm.
                *end = ' ';
                char *segment = start, segmentEnd = (type & StringTrieGraph.TrieGraphEndFlag) == 0 ? start++ : ++start;
                while (((type = charTypeData[*start]) & (byte)WordType.TrieGraph) != 0)
                {
                    ++start;
                    if ((type & StringTrieGraph.TrieGraphEndFlag) != 0)
                    {
                        // Remember the position just past the last end-flagged character.
                        segmentEnd = start;
                    }
                }
                if ((int)(start - segment) == 1)
                {
                    // NOTE(review): type was last assigned from the character at start (the one after this
                    // one-character segment) and the loop above only exits when it lacks the TrieGraph flag,
                    // so the conditional below always selects WordType.Chinese - confirm this is intended.
                    if ((type & (byte)WordType.Chinese) != 0)
                    {
                        addWord((int)(segment - textFixed), 1, (type & (byte)WordType.TrieGraph) != 0 ? WordType.TrieGraph : WordType.Chinese);
                    }
                }
                else
                {
                    if (segment != segmentEnd)
                    {
                        matchs.Length = 0;
                        trieGraph.LeftRightMatchs(segment, segmentEnd, ref matchs);
                        if ((count = matchs.Length) == 0)
                        {
                            // Nothing matched: treat the whole segment as loose characters.
                            segmentEnd = segment;
                            goto CHINESE;
                        }
                        if (!isMatchMap)
                        {
                            // Done at most once per call, before the first matchMap.Set.
                            checkMatchMap();
                            isMatchMap = true;
                        }
                        startIndex = (int)(segment - textFixed);
                        // Only the first matchs.Length entries of matchs.Array are consumed.
                        foreach (KeyValue <int, int> value in matchs.Array)
                        {
                            addWord(index = value.Key + startIndex, value.Value, WordType.TrieGraph);
                            matchMap.Set(index, value.Value);
                            if (--count == 0)
                            {
                                break;
                            }
                        }
                        index = (int)(segmentEnd - textFixed);
                        do
                        {
                            // Chinese characters for which matchMap reports 0 become single-character words.
                            if (matchMap.Get(startIndex) == 0 && (charTypeData[textFixed[startIndex]] & (byte)WordType.Chinese) != 0)
                            {
                                addWord(startIndex, 1, WordType.Chinese);
                            }
                        }while (++startIndex != index);
                    }
                    CHINESE:
                    // Remaining characters between segmentEnd and start: Chinese ones become single-character words.
                    while (segmentEnd != start)
                    {
                        if ((charTypeData[*segmentEnd] & (byte)WordType.Chinese) != 0)
                        {
                            addWord((int)(segmentEnd - textFixed), 1, WordType.Chinese);
                        }
                        ++segmentEnd;
                    }
                }
            }while (start != end);
            TRIEGRAPHEND:
            // Rewind for the second pass.
            start = textFixed;
        }
        // Pass 2: character-class runs.
        do
        {
            type = charTypeData[*start];
            if ((type &= ((byte)WordType.Chinese | (byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) == 0)
            {
                // Sentinel for the skip loop below; presumably '0' is classified as a Number so the loop stops at end - confirm.
                *end = '0';
                do
                {
                    type = charTypeData[*++start];
                    if ((type &= ((byte)WordType.Chinese | (byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) != 0)
                    {
                        if (start == end)
                        {
                            goto END;
                        }
                        goto OTHER;
                    }
                }while (true);
            }
            OTHER:
            *end = ' ';
            if ((type & (byte)WordType.Chinese) != 0)
            {
                do
                {
                    if ((type & (byte)WordType.TrieGraph) == 0)
                    {
                        addWord((int)(start - textFixed), 1, WordType.Chinese);
                    }
                }while (((type = charTypeData[*++start]) & (byte)WordType.Chinese) != 0);
            }
            else
            {
                char *segment = start;
                if ((type & (byte)WordType.OtherLetter) == 0)
                {
                    char *word = start;
                    // wordType accumulates every class seen in the run and is used for the whole-run word.
                    wordType = type;
                    for (nextType = charTypeData[*++start]; (nextType &= ((byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) != 0; nextType = charTypeData[*++start])
                    {
                        if (type != nextType)
                        {
                            // The class changed: emit the sub-word that just ended unless it was a Keep run.
                            if (type != (byte)WordType.Keep)
                            {
                                addWord((int)(word - textFixed), (int)(start - word), (WordType)type);
                            }
                            wordType |= nextType;
                            type = nextType;
                            word = start;
                        }
                    }
                    // Trailing sub-word after the last class change (skipped when the run never changed class).
                    if (word != segment && type != (byte)WordType.Keep)
                    {
                        addWord((int)(word - textFixed), (int)(start - word), (WordType)type);
                    }
                    // The whole run is always emitted as one word.
                    addWord((int)(segment - textFixed), (int)(start - segment), (WordType)wordType);
                }
                else
                {
                    while ((charTypeData[*++start] & (byte)WordType.OtherLetter) != 0)
                    {
                        ;
                    }
                    addWord((int)(segment - textFixed), (int)(start - segment), WordType.OtherLetter);
                }
            }
        }while (start != end);
        END:
        // Copy the collected words into result.
        if (words.Length != 0)
        {
            int count = words.Length, textLength = text.Length;
            if ((searcher.flags & SearchFlags.ResultIndexs) == 0)
            {
                // No per-occurrence indexes requested: store only the word type and the text length.
                // Only the first words.Length entries of words.Array are consumed.
                foreach (KeyValue <SubString, WordType> word in words.Array)
                {
                    result[word.Key] = new ResultIndexLeftArray { WordType = word.Value, TextLength = textLength };
                    if (--count == 0)
                    {
                        break;
                    }
                }
            }
            else
            {
                ResultIndexLeftArray indexs;
                foreach (KeyValue <SubString, WordType> word in words.Array)
                {
                    HashString wordKey = word.Key;
                    if (result.TryGetValue(wordKey, out indexs))
                    {
                        // Known word: append this occurrence and write the entry back.
                        indexs.Indexs.Add(word.Key.Start);
                        result[wordKey] = indexs;
                    }
                    else
                    {
                        indexs.Set(textLength, word.Value);
                        // Take a spare array from indexArrays when one is available.
                        if (indexArrays.Length != 0)
                        {
                            indexs.Indexs.Set(indexArrays.UnsafePopOnly(), 0);
                        }
                        indexs.Indexs.Add(word.Key.Start);
                        result.Add(wordKey, indexs);
                    }
                    if (--count == 0)
                    {
                        break;
                    }
                }
                foreach (ResultIndexLeftArray indexArray in result.Values)
                {
                    indexArray.Indexs.sort();
                }
            }
        }
    }
}
/// <summary>
/// Segments the text in a buffer and passes every word to <c>checkAddWord</c>.
/// </summary>
/// <param name="textFixed">Start of the text buffer. <c>formatLength</c> characters are scanned and the character at index <c>formatLength</c> is overwritten as a sentinel, so the buffer must hold at least <c>formatLength + 1</c> characters.</param>
/// <param name="isAllMatch">Whether every keyword is required to match. When true, the scan stops at the first word for which <c>checkAddWord</c> returns false, and single Chinese characters in the second pass are not checked at all.</param>
/// <returns>False when <paramref name="isAllMatch"/> is true and <c>checkAddWord</c> returned false for some word; otherwise true.</returns>
private bool get(char *textFixed, bool isAllMatch)
{
    char *start = textFixed, end = textFixed + formatLength;
    try
    {
        matchs.Length = 0;
        byte type, nextType;
        bool isMatchMap = false;
        if (charTypeData != StringTrieGraph.DefaultCharTypeData.Byte)
        {
            // Pass 1: words found through the Trie graph.
            StaticStringTrieGraph trieGraph = searcher.trieGraph;
            int count, index, startIndex;
            char trieGraphHeadChar = trieGraph.AnyHeadChar;
            do
            {
                if (((type = charTypeData[*start]) & StringTrieGraph.TrieGraphHeadFlag) == 0)
                {
                    // Sentinel for the inner scan, which only stops on a head-flagged character
                    // (presumably AnyHeadChar always carries TrieGraphHeadFlag - confirm).
                    *end = trieGraphHeadChar;
                    do
                    {
                        if ((type & ((byte)WordType.Chinese | (byte)WordType.TrieGraph)) == ((byte)WordType.Chinese | (byte)WordType.TrieGraph))
                        {
                            if (!checkAddWord((int)(start - textFixed), 1) && isAllMatch)
                            {
                                return(false);
                            }
                        }
                        if (((nextType = charTypeData[*++start]) & StringTrieGraph.TrieGraphHeadFlag) != 0)
                        {
                            if (start == end)
                            {
                                // Only the sentinel was hit: pass 1 is finished.
                                goto TRIEGRAPHEND;
                            }
                            if ((nextType & (byte)WordType.Chinese) != 0 || (type & nextType & ((byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) == 0)
                            {
                                goto TRIEGRAPH;
                            }
                        }
                        type = nextType;
                    }while (true);
                }
                TRIEGRAPH:
                // Replace the sentinel with a space; presumably a space carries no TrieGraph flag, so the scan below stops at end - confirm.
                *end = ' ';
                char *segment = start, segmentEnd = (type & StringTrieGraph.TrieGraphEndFlag) == 0 ? start++ : ++start;
                while (((type = charTypeData[*start]) & (byte)WordType.TrieGraph) != 0)
                {
                    ++start;
                    if ((type & StringTrieGraph.TrieGraphEndFlag) != 0)
                    {
                        // Remember the position just past the last end-flagged character.
                        segmentEnd = start;
                    }
                }
                if ((int)(start - segment) == 1)
                {
                    // NOTE(review): type was last assigned from the character at start (the one after this
                    // one-character segment), not from the segment character itself - confirm this is intended.
                    if ((type & (byte)WordType.Chinese) != 0)
                    {
                        if (!checkAddWord((int)(segment - textFixed), 1) && isAllMatch)
                        {
                            return(false);
                        }
                    }
                }
                else
                {
                    if (segment != segmentEnd)
                    {
                        matchs.Length = 0;
                        trieGraph.LeftRightMatchs(segment, segmentEnd, ref matchs);
                        if ((count = matchs.Length) == 0)
                        {
                            // Nothing matched: treat the whole segment as loose characters.
                            segmentEnd = segment;
                            goto CHINESE;
                        }
                        if (!isMatchMap)
                        {
                            // Done at most once per call, before the first matchMap.Set.
                            checkMatchMap();
                            isMatchMap = true;
                        }
                        startIndex = (int)(segment - textFixed);
                        // Only the first matchs.Length entries of matchs.Array are consumed.
                        foreach (KeyValue <int, int> value in matchs.Array)
                        {
                            if (!checkAddWord(index = value.Key + startIndex, value.Value) && isAllMatch)
                            {
                                return(false);
                            }
                            matchMap.Set(index, value.Value);
                            if (--count == 0)
                            {
                                break;
                            }
                        }
                        index = (int)(segmentEnd - textFixed);
                        do
                        {
                            // Chinese characters for which matchMap reports 0 are checked as single-character words.
                            if (matchMap.Get(startIndex) == 0 && (charTypeData[textFixed[startIndex]] & (byte)WordType.Chinese) != 0)
                            {
                                if (!checkAddWord(startIndex, 1) && isAllMatch)
                                {
                                    return(false);
                                }
                            }
                        }while (++startIndex != index);
                    }
                    CHINESE:
                    // Remaining characters between segmentEnd and start: Chinese ones are checked as single-character words.
                    while (segmentEnd != start)
                    {
                        if ((charTypeData[*segmentEnd] & (byte)WordType.Chinese) != 0)
                        {
                            if (!checkAddWord((int)(segmentEnd - textFixed), 1) && isAllMatch)
                            {
                                return(false);
                            }
                        }
                        ++segmentEnd;
                    }
                }
            }while (start != end);
            TRIEGRAPHEND:
            // Rewind for the second pass.
            start = textFixed;
        }
        // Pass 2: character-class runs.
        do
        {
            type = charTypeData[*start];
            if ((type &= ((byte)WordType.Chinese | (byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) == 0)
            {
                // Sentinel for the skip loop below; presumably '0' is classified as a Number so the loop stops at end - confirm.
                *end = '0';
                do
                {
                    type = charTypeData[*++start];
                    if ((type &= ((byte)WordType.Chinese | (byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) != 0)
                    {
                        if (start == end)
                        {
                            return(true);
                        }
                        goto OTHER;
                    }
                }while (true);
            }
            OTHER:
            *end = ' ';
            if ((type & (byte)WordType.Chinese) != 0)
            {
                do
                {
                    // Single Chinese characters are only checked when a full match is not required; the result is ignored.
                    if ((type & (byte)WordType.TrieGraph) == 0 && !isAllMatch)
                    {
                        checkAddWord((int)(start - textFixed), 1);
                    }
                }while (((type = charTypeData[*++start]) & (byte)WordType.Chinese) != 0);
            }
            else
            {
                char *segment = start;
                if ((type & (byte)WordType.OtherLetter) == 0)
                {
                    char *word = start;
                    // isWord records whether any sub-word of this run has been passed to checkAddWord.
                    bool isWord = false;
                    for (nextType = charTypeData[*++start]; (nextType &= ((byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) != 0; nextType = charTypeData[*++start])
                    {
                        if (type != nextType)
                        {
                            // The class changed: check the sub-word that just ended unless it was a Keep run.
                            if (type != (byte)WordType.Keep)
                            {
                                if (!checkAddWord((int)(word - textFixed), (int)(start - word)) && isAllMatch)
                                {
                                    return(false);
                                }
                                isWord = true;
                            }
                            type = nextType;
                            word = start;
                        }
                    }
                    // Trailing sub-word after the last class change.
                    if (word != segment && type != (byte)WordType.Keep)
                    {
                        if (!checkAddWord((int)(word - textFixed), (int)(start - word)) && isAllMatch)
                        {
                            return(false);
                        }
                        isWord = true;
                    }
                    // The whole run is checked only when no sub-word was.
                    if (!isWord)
                    {
                        if (!checkAddWord((int)(segment - textFixed), (int)(start - segment)) && isAllMatch)
                        {
                            return(false);
                        }
                    }
                }
                else
                {
                    while ((charTypeData[*++start] & (byte)WordType.OtherLetter) != 0)
                    {
                        ;
                    }
                    if (!checkAddWord((int)(segment - textFixed), (int)(start - segment)) && isAllMatch)
                    {
                        return(false);
                    }
                }
            }
        }while (start != end);
    }
    finally
    {
        // Reset the sentinel slot to a space on every exit path.
        *end = ' ';
    }
    return(true);
}