/// <summary>
/// Registers <paramref name="node"/> as a child of this node, keyed by its token id, and
/// updates the bookkeeping counters along the parent chain.
/// </summary>
/// <param name="node">The child node to register; its <c>TokenId</c> is the key used in <c>ChildTokenIds</c>.</param>
/// <param name="updateParents">Defaults to <c>true</c>. NOTE(review): not referenced in the lines visible here - confirm how it is meant to be used.</param>
protected void AppendNewNode(TokenZipNode node, bool updateParents = true)
            // evaluated before the node is added to ChildTokenIds; decides below whether DistanceToTop grows
            bool hasChildren = this.ChildNodes.Count > 0;

            // NOTE(review): presumably ChildTokenIds is a Dictionary, in which case Add throws when the token id is already present - confirm
            this.ChildTokenIds.Add(node.TokenId, node);
            // Debug.WriteLine("Appended: " + node.DebuggerDisplay + " to ChildNodes: " + ChildNodes.GetHashCode());

            // caps DistanceToTop at 1 before the walk below may increment it
            // NOTE(review): Math.Min lowers any value above 1 to 1 - confirm a cap (rather than a floor) is intended
            this.DistanceToTop = Math.Min(1, this.DistanceToTop);
            TokenZipNode pNode = this;

            // walk from this node upwards through the Parent links
            while (pNode != null)
                // every visited node gains one descendant and is flagged so that Compact() will look at it again
                pNode.DescendantsCount += 1;
                pNode.CheckCompact      = true;
                // the distance to the top only grows when this node had no children before the append
                if (!hasChildren)
                    pNode.DistanceToTop += 1;
                // NOTE(review): the statement guarded by this IsRoot check is not visible here - presumably it ends the walk; confirm
                if (pNode.IsRoot)
                pNode = pNode.Parent;
        /// <summary>
        /// Computes the affinity of <paramref name="item"/> against the token zip tree rooted at
        /// <paramref name="rootZipNode"/> as a pair of (matching weight, total weight).
        /// </summary>
        /// <param name="item">Item whose token place map and token list are scored.</param>
        /// <param name="rootZipNode">Node at which the recursive tree walk starts (distance 0).</param>
        /// <param name="config">Supplies <c>MinClusterAffinity</c>, which is squared to derive the encounter threshold.</param>
        /// <param name="itemsCount">Item count used to scale the threshold; also passed on to the recursive overload.</param>
        /// <returns>
        /// Item1: the tree walk's matching weight plus the found-token weight.
        /// Item2: the tree walk's total weight plus the found-token and not-found-token weights.
        /// </returns>
        protected Tuple <float, float> CalculateAffinity(ClusterItem item, TokenZipNode rootZipNode, ClusteringConfig config, int itemsCount)
            Dictionary <int, int[]> itemTokenPlaceMap = item.GetTokenPlacesMap();

            Debug.Assert(itemTokenPlaceMap != null);

            // threshold = floor(itemsCount * MinClusterAffinity^2); the recursive overload compares node encounters against it
            int encounterThreshold = (int)Math.Floor(itemsCount * config.MinClusterAffinity * config.MinClusterAffinity);
            // passed by ref to the tree walk; read below to decide which of the item's tokens count as found
            var itemTokensFound    = new HashSet <int>();
            Tuple <float, float> matchingAndTotalWeight = CalculateAffinity(0, itemTokenPlaceMap, rootZipNode, itemsCount, encounterThreshold, null, ref itemTokensFound);

            float foundWeight = 0.0f, notFoundWeight = 0.0f;
            int   seqIndex = 1;

            foreach (Token token in item.TokenIndex.Tokens)
                if (itemTokensFound.Contains(token.Id))
                    // a found token restarts the miss counter and contributes a flat weight of 1
                    seqIndex     = 1;
                    foundWeight += 1;
                    // adds seqIndex^1.25 and then advances seqIndex, so each further step costs more than the last
                    // NOTE(review): presumably this is the not-found branch (its else is not visible here) - confirm
                    notFoundWeight += (float)(1 * Math.Pow(seqIndex++, 1.25));
            return(new Tuple <float, float>(matchingAndTotalWeight.Item1 + foundWeight, matchingAndTotalWeight.Item2 + foundWeight + notFoundWeight));
        /// <summary>
        /// Starts building a root node: a node with the wildcard token id, zero weight and no parent,
        /// carrying the supplied vocabulary.
        /// </summary>
        /// <param name="tokens">NOTE(review): not referenced in the lines visible here - presumably the first sequence to store in the tree; confirm.</param>
        /// <param name="vocabulary">Optional vocabulary stored on the new node; defaults to <c>null</c>.</param>
        public static TokenZipNode CreateRoot(IEnumerable <Token> tokens, ClusteringVocabulary vocabulary = null)
            // the root has the wildcard id, weight 0 and a null parent
            var node = new TokenZipNode(WildcardId, 0, null);

            node.Vocabulary = vocabulary;
        /// <summary>
        /// TokenZips zip chains are updated when adding an item to include the item's own sequences, extend TokenZips' sequences by an extra token and remove TokenZips chains that can be consolidated
        /// </summary>
        /// <param name="item">The item being added; its tokens seed the zip tree root when no root exists yet.</param>
        /// <param name="affinity">NOTE(review): not referenced in the lines visible here - confirm.</param>
        protected override void OnAddToCluster(ClusterItem item, float affinity)
            // reset MaxScore to null; NOTE(review): presumably a cached value that is recomputed later - confirm
            this.MaxScore     = null;
            // create the root lazily, on the first item added
            this.TokenZipRoot = this.TokenZipRoot ?? TokenZipNode.CreateRoot(item.TokenIndex.Tokens, this.Vocabulary);


        /// <summary>
        /// Creates a node for <paramref name="tokenId"/> whose parent is this node, placed one step
        /// further from the root and sharing this node's vocabulary.
        /// </summary>
        /// <param name="tokenId">Token id for the new node.</param>
        /// <param name="weight">Token weight for the new node.</param>
        /// <returns>NOTE(review): the return statement is not visible here - presumably the new node; confirm.</returns>
        protected virtual TokenZipNode CreateChildNode(int tokenId, float weight)
            var node = new TokenZipNode(tokenId, weight, this);

            node.DistanceToRoot = this.DistanceToRoot + 1;
            node.Vocabulary = this.Vocabulary;
        /// <summary>
        /// Registers <paramref name="node"/> under the wildcard id in <c>ChildTokenIds</c> and flags this
        /// node and all of its ancestors for a new compaction check.
        /// </summary>
        /// <param name="node">The node to store under <c>WildcardId</c>.</param>
        private void CompactChildrenToNode(TokenZipNode node)
            // NOTE(review): the call site in Compact() describes this as replacing all child nodes; the removal of the
            // existing children is not visible in these lines - confirm
            this.ChildTokenIds.Add(WildcardId, node);
            TokenZipNode pNode = this;

            // walk up through the Parent links so every ancestor is re-examined by Compact()
            while (pNode != null)
                pNode.CheckCompact = true;
                pNode = pNode.Parent;
        /// <summary>
        /// Adds the sequence to the zip tree, updates Encounters
        /// </summary>
        /// <param name="tokens">The token sequence to walk into the tree, starting at this node.</param>
        public void Append(IEnumerable <Token> tokens)
            TokenZipNode stepNode = this;

            foreach (Token token in tokens)
                TokenZipNode existingNode = null;
                // AppendOrGetExisting returns true when a node already existed: step into it;
                // otherwise the walk restarts from this node for the next token
                stepNode = stepNode.AppendOrGetExisting(token, out existingNode) ? existingNode : this;
            // NOTE(review): the statement guarded by this check is not visible here - confirm what happens when the
            // walk ends on a node other than this one
            if (stepNode != this)
        // the sum of all zip chains' last matching nodes' (weight chain * encounter / maxEncounter)
        //
        // Walks tokenSequence through the tree starting at this node. While tokens keep matching, the
        // matched nodes' TokenWeight values accumulate in stepScore; a finished chain contributes
        // stepScore * height * Encounters / maxEncounters to the score.
        // NOTE(review): the return statement and any increments of height are not visible in these lines - confirm.
        public float GetScore(IEnumerable <Token> tokenSequence, int maxEncounters)// TODO: refactor maxEncounters to be counted within the method because repetitions within the same item's tokens can lead to more encounters than items
            float score = 0;

            TokenZipNode stepNode  = this;
            int          height    = 0;
            float        stepScore = 0;

            foreach (Token token in tokenSequence)
                TokenZipNode matchingChild;

                // continue the current chain: exact child of the current step node first, then its wildcard child
                if (stepNode.ChildTokenIds.TryGetValue(token.Id, out matchingChild) ||
                    stepNode.ChildTokenIds.TryGetValue(WildcardId, out matchingChild))
                    stepNode   = matchingChild;
                    stepScore += matchingChild.TokenWeight;
                // otherwise try to match from this node's own children (exact id, then wildcard)
                else if (this.ChildTokenIds.TryGetValue(token.Id, out matchingChild) ||
                         this.ChildTokenIds.TryGetValue(WildcardId, out matchingChild))
                    stepNode   = matchingChild;
                    stepScore += matchingChild.TokenWeight;
                    height     = 1;
                    // NOTE(review): presumably the lines below form the no-match branch (its else is not visible here):
                    // the open chain's score is banked, then the walk is reset - confirm
                    if (stepNode != this)
                        score += stepScore * height * stepNode.Encounters / maxEncounters;

                    // on a failed lookup stepNode is reset to this node and the chain counters are cleared
                    if (!this.ChildTokenIds.TryGetValue(token.Id, out stepNode))
                        stepNode  = this;
                        stepScore = 0;
                        height    = 0;

            // bank the chain that was still open when the sequence ended
            score += stepScore * height * stepNode.Encounters / maxEncounters;

        // returns true when there is an existing node
        //
        // Looks for a wildcard child first, then for a child keyed by token.Id; when neither lookup
        // finds a node, a new child is created from the token's id and weight.
        // existingNode receives the result of the last lookup performed (null when nothing was found).
        protected bool AppendOrGetExisting(Token token, out TokenZipNode existingNode)
            // NOTE(review): the statement guarded by this wildcard check is not visible here - presumably the
            // wildcard child is returned as the existing node; confirm
            if (this.ChildTokenIds.TryGetValue(WildcardId, out existingNode))
                // Debug.WriteLine("Found *: " + existingNode.DebuggerDisplay);

            if (!this.ChildTokenIds.TryGetValue(token.Id, out existingNode))
                // no child for this token yet: build one
                // NOTE(review): the lines that register the new node and that handle an already existing child are
                // not visible here - confirm
                TokenZipNode node = this.CreateChildNode(token.Id, token.Weight);

                // Debug.WriteLine("Found: " + existingNode.DebuggerDisplay);

            return(existingNode != null);
        /// <summary>
        /// Recursive step of the affinity calculation: scores <paramref name="tokenNode"/> and its subtree
        /// against the item's token positions and returns a pair of (matching weight, total weight).
        /// </summary>
        /// <param name="distanceToRoot">Depth of <paramref name="tokenNode"/> in this walk; the top-level caller passes 0.</param>
        /// <param name="itemTokenPlaceMap">Maps a token id to the positions at which it occurs in the item.</param>
        /// <param name="tokenNode">The tree node being scored.</param>
        /// <param name="itemsCount">Divisor used to scale the node weight by its encounters.</param>
        /// <param name="encounterThreshold">Minimum value of (Encounters + distance to root) for a node to contribute.</param>
        /// <param name="potentialPlaces">Candidate positions carried down from the parent step; <c>null</c> at the top level.</param>
        /// <param name="itemTokensFound">Set shared across the whole walk. NOTE(review): the line that adds to it is not visible here - confirm.</param>
        /// <returns>Item1 is the matching weight and Item2 the total weight for this subtree.</returns>
        protected Tuple <float, float> CalculateAffinity(int distanceToRoot, Dictionary <int, int[]> itemTokenPlaceMap, TokenZipNode tokenNode, int itemsCount, int encounterThreshold, HashSet <int> potentialPlaces, ref HashSet <int> itemTokensFound)
            // true when potentialPlaces is null (top level) or non-empty; false only for an empty set
            bool zipNodeMatches = (potentialPlaces?.Any() != false);

            // NOTE(review): the statement guarded by this check is not visible here - presumably it records
            // tokenNode.TokenId in itemTokensFound; confirm
            if (zipNodeMatches && tokenNode.TokenId >= 0)

            // heightBonus = ln(distanceToRoot + 1); tzWeight scales the node's weight by depth and relative encounters
            float heightBonus = (float)Math.Log(distanceToRoot + 1);
            float tzWeight    = tokenNode.TokenWeight * distanceToRoot * tokenNode.Encounters / itemsCount;

            if (tokenNode.IsLeaf)
                if (zipNodeMatches)
                    // a complete matching chain gets emphasized by multiplying by its height
                    tzWeight *= heightBonus; // added bonus for reaching the top
                    return(new Tuple <float, float>(tzWeight, tzWeight));
                    // NOTE(review): presumably the non-matching leaf branch (its else is not visible here):
                    // nothing matched, but the node still counts towards the total
                    return(new Tuple <float, float>(0, tzWeight));
                // NOTE(review): presumably the non-leaf branch starts here (its else is not visible) - confirm
                float matchingWeight = 0;
                float totalWeight    = 0;
                foreach (TokenZipNode tzTreeChild in tokenNode.ChildNodes)
                    int[] tokenPlaceMap   = null;
                    // a wildcard child counts as found without a lookup
                    bool  startTokenFound = tzTreeChild.TokenId == TokenZipNode.WildcardId;
                    if (!startTokenFound)
                        // up to three lookup attempts; a failure on the first two is caught and retried after a short delay
                        // NOTE(review): the try keyword is not visible here; the blocking Task.Delay(1).Wait() works around
                        // concurrent access to itemTokenPlaceMap rather than fixing it - revisit
                        for (int i = 0; i < 3; i++) // TODO: HACK for some race condition causing nullref
                                startTokenFound = itemTokenPlaceMap.TryGetValue(tzTreeChild.TokenId, out tokenPlaceMap);
                            catch when(i < 2)
                                System.Threading.Tasks.Task.Delay(1).Wait(); // ugly hack for thread-contested itemTokenPlaceMap

                    // Say our token ID is 5 and the item has  [2,3,4,5,4,3,5,7,9] tokens.
                    // tokenPlaceMap[5] will be [3,6]
                    // this means our next token must be 4 or 7 to match anything
                    // if it's 4 then we remove the 3 from [3,6], if it's 7 - remove 6
                    // our potentialPlaces items must be present (at least some) in the tokenPlace map with an offset of DistanceToRoot
                    // ie potentialPlaces[5,6] matches  tokenPlaceMap[5+tz,DistanceToRoot]
                    HashSet <int> nextPotentialPlaces = null;
                    // NOTE(review): the nesting of the three checks below cannot be determined from the visible lines - confirm
                    if (!startTokenFound)
                        if (nextPotentialPlaces == null)
                            nextPotentialPlaces = new HashSet <int>();
                        if (tokenPlaceMap != null)
                            // if there are potential places for next tokens - take them into account
                            // with incoming places: keep those not present in the token's positions shifted by distanceToRoot;
                            // without: start from the token's own positions
                            nextPotentialPlaces = potentialPlaces?.Any() == true
                                ? new HashSet <int>(potentialPlaces.Except(tokenPlaceMap.Select(t => t + distanceToRoot)))
                                : new HashSet <int>(tokenPlaceMap);

                    Tuple <float, float> nodeAffinity = CalculateAffinity(distanceToRoot + 1, itemTokenPlaceMap, tzTreeChild, itemsCount, encounterThreshold, nextPotentialPlaces, ref itemTokensFound);

                    // children below the encounter threshold do not contribute
                    if (tzTreeChild.Encounters + tzTreeChild.DistanceToRoot >= encounterThreshold)
                        // matching weight is only taken from children that still have candidate places, or from wildcards
                        if (nextPotentialPlaces?.Any() == true || tzTreeChild.TokenId == TokenZipNode.WildcardId)
                            matchingWeight += nodeAffinity.Item1;
                        totalWeight += nodeAffinity.Item2;

                // some child matched: this node's weight is added to both the matching and the total weight
                if (matchingWeight > 0)
                    return(new Tuple <float, float>(matchingWeight + tzWeight, totalWeight + tzWeight));
                // no child matched, but this node itself meets the threshold: score it with the height bonus
                else if (tokenNode.Encounters + distanceToRoot >= encounterThreshold)
                    tzWeight *= heightBonus;
                    return(new Tuple <float, float>(tzWeight, totalWeight + tzWeight));
                    // NOTE(review): presumably the final else (not visible here): the subtree contributes nothing
                    return(new Tuple <float, float>(0, 0));
        /// <summary>
        /// Attempts to compact this node's subtree: chains that duplicate or can be merged with a sibling
        /// chain are folded, and sets of differing children are replaced by a wildcard node when the
        /// encounter counts allow it. Children are then compacted recursively.
        /// </summary>
        /// <returns>
        /// NOTE(review): the return statements are not visible in these lines; the recursive call below treats a
        /// <c>true</c> result as "this child was compacted" - confirm.
        /// </returns>
        public bool Compact()
            // nothing to do unless the node is flagged for a check and has children
            // NOTE(review): the statement guarded by this check is not visible here - presumably an early return; confirm
            if (!this.CheckCompact || this.ChildNodes.Count == 0)
            this.CheckCompact = false; // don't check anymore until a descendant changes

            // --- single child ---
            if (this.ChildNodes.Count == 1)
                TokenZipNode temp = null;
                // looks under Parent for a node (temp) that has a child keyed by this node's token id, the same
                // DistanceToTop, at least as many Encounters and the same sequence of descendant token ids
                if (!this.IsRoot && this.DistanceToTop > 1 &&
                    (temp = this.Parent.ChildNodes.FirstOrDefault(cn => cn.ChildTokenIds.ContainsKey(this.TokenId))) != null &&
                    temp.DistanceToTop == this.DistanceToTop && temp.Encounters >= this.Encounters &&
                    temp.GetDescendants().Select(d => d.TokenId).SequenceEqual(this.GetDescendants().Select(d => d.TokenId)))
                    // this node's sequence is already found under its parent node
                // NOTE(review): the action taken for the case above, and the receiver of the .Select(...) call in the
                // condition below, are not visible in these lines - confirm
                if (!this.IsRoot && this.Parent.ChildNodes.Count >= this.DescendantsCount &&
                    this.DistanceToTop > 1 && this.DescendantsCount == this.DistanceToTop &&
                    .Select((dn, i) => this.Parent.ChildTokenIds.TryGetValue(dn.TokenId, out temp) &&
                            temp.Encounters + dn.Encounters == this.Encounters)
                    .All((r) => r))
                    // previously unmerged sequence
                    // A2 B1
                    // B1
                    // --> A2 B2
                    IEnumerable <TokenZipNode> descNodes = this.GetDescendants();
                    // every descendant takes over this node's encounter count
                    foreach (TokenZipNode dn in descNodes)
                        dn.Encounters = this.Encounters;
            // --- several children ---
            else if (this.ChildNodes.Count > 1)
                if (this.DistanceToTop == 1) // the children are leaves
                    // look for common suffix
                    if (this.Parent != null && this.Encounters == this.ChildNodes.Sum(c => c.Encounters) + 1) // +1 because this node was first encountered without any children
                        // skipped when one of the children repeats this node's own token id
                        if (!this.ChildNodes.Any(c => c.TokenId == this.TokenId))
                            // this is a leaf-holding branch with variable children we can compact it
                            // but only if the sum of all children's encounters are equal to this node's encounters
                            // the wildcard's weight is the children's weights averaged by their encounters
                            float        freqAvgWeight = this.ChildNodes.Sum(c => c.Encounters * c.TokenWeight) / (this.Encounters - 1);
                            TokenZipNode node          = this.CreateChildNode(WildcardId, freqAvgWeight);
                            node.Encounters = this.ChildNodes.Sum(c => c.Encounters);
                            // NOTE(review): what is done with the new wildcard node after this point is not visible here - confirm
                else if (this.DistanceToTop == 3 && this.ChildNodes.All(c => c.DistanceToTop == 2))
                    // look for common prefix...
                    // deferred query: it is re-enumerated by First() and All() below
                    IEnumerable <TokenZipNode> grandChildren = this.ChildNodes.SelectMany(c => c.ChildNodes);

                    TokenZipNode oneGc = grandChildren.First();
                    if (!this.ChildNodes.Any(c => c.TokenId == oneGc.TokenId))
                        int  sumEncounters        = 0;
                        // the lambda also accumulates sumEncounters as a side effect; All stops at the first mismatch,
                        // so the sum is only complete when allGrandchildrenSame is true
                        bool allGrandchildrenSame = grandChildren.All(gc =>
                            sumEncounters += gc.Encounters;
                            return(gc.TokenId == oneGc.TokenId);

                        if (allGrandchildrenSame &&
                            sumEncounters == this.ChildNodes.Sum(c => c.Encounters) &&
                            (this.Parent == null || this.Encounters == sumEncounters))
                            // all grandchildren are the same, we can compact the children into a wildcard
                            // and the sum encounters of all grandchildren equal that of the child encounters and this node's encounters
                            oneGc.Encounters = sumEncounters;
                            float        freqAvgWeight = this.ChildNodes.Average(c => c.Encounters * c.TokenWeight / this.Encounters);
                            TokenZipNode node          = this.CreateChildNode(WildcardId, freqAvgWeight);
                            node.Encounters = sumEncounters;
                            // the wildcard gets a single child that stands in for the shared grandchild token
                            TokenZipNode gcNode = node.CreateChildNode(oneGc.TokenId, oneGc.TokenWeight);
                            gcNode.Encounters = node.Encounters - 1;

                            this.CompactChildrenToNode(node); // replace all child nodes with a wildcard node that has the grandchild node in it

                else if (!this.IsRoot && this.DistanceToTop > 3 && this.ChildNodes.All(c => c.DistanceToTop > 3 && c.DistanceToTop == c.DescendantsCount))
                    // long prefix
                    IEnumerable <TokenZipNode> steps = this.ChildNodes;
                    int stepTokenId = this.ChildNodes.First().TokenId;
                    if (steps.All(s => s.TokenId == stepTokenId))
                        // all shoots have the same first token, join them
                        // NOTE(review): the joining code itself is not visible in these lines - confirm

            // --- recurse into the children ---
            bool oneChildCompacted = false;
            int  ci = 0;

            // NOTE(review): the update of ci and the final return are not visible in these lines - confirm
            while (ci < this.ChildNodes.Count)
                TokenZipNode ch = this.ChildNodes[ci];
                if (ch.Compact())
                    oneChildCompacted = true;
 /// <summary>
 /// Initializes a node from an explicit token id and weight.
 /// </summary>
 /// <param name="tokenId">Stored in <c>TokenId</c>.</param>
 /// <param name="tokenWeight">Stored in <c>TokenWeight</c>.</param>
 /// <param name="parent">Stored in <c>Parent</c>; <c>CreateRoot</c> passes <c>null</c> here.</param>
 protected TokenZipNode(int tokenId, float tokenWeight, TokenZipNode parent)
     this.TokenId     = tokenId;
     this.TokenWeight = tokenWeight;
     this.Parent      = parent;
 /// <summary>
 /// Initializes a node from a token, copying the token's id and weight.
 /// </summary>
 /// <param name="token">Source of the <c>TokenId</c> and <c>TokenWeight</c> values; dereferenced without a null check.</param>
 /// <param name="parent">Stored in <c>Parent</c>.</param>
 protected TokenZipNode(Token token, TokenZipNode parent)
     this.TokenId     = token.Id;
     this.TokenWeight = token.Weight;
     this.Parent      = parent;