Пример #1
0
        /// <summary>
        /// Builds a markov structure out of the pipeline's intermediate collections.
        /// Intended to be called by the pipeline only.
        /// </summary>
        /// <param name="dic">Master dictionary, array of words</param>
        /// <param name="grms">Thread-safe master list of ngrams</param>
        /// <param name="prototypeChainlinks">Prototype of chain links, maps ngram-index
        /// to prototype of successors (which maps succeeding index to weight)</param>
        /// <param name="sds">Prototype of seed list; only its keys are read</param>
        public MarkovStructure(string[] dic, ConcurrentQueue <NGram> grms,
                               ConcurrentDictionary <int, ConcurrentDictionary <int, int> > prototypeChainlinks,
                               ConcurrentDictionary <int, bool> sds)
        {
            // The master dictionary is stored by reference, not copied
            dictionary = dic;

            // Snapshot the ngram queue into the master ngram table
            grams = grms.ToArray();

            // One chain link per ngram; a link lives at the same index as its ngram.
            // Each link is built from the successor prototype registered under that index.
            int gramCount = grams.Length;
            var links     = new MarkovSegment[gramCount];
            Parallel.For(0, gramCount, gramIndex =>
            {
                var successorPrototype = prototypeChainlinks[gramIndex];
                links[gramIndex] = new MarkovSegment(successorPrototype);
            });
            chainLinks = links;

            // Seeds are the keys of the seed prototype map
            seeds = sds.Keys.ToArray();
        }
Пример #2
0
        /// <summary>
        /// Merges the successors of this segment with those of <paramref name="other"/>.
        /// Successors that resolve to the same combined ngram index have their weights summed;
        /// all remaining successors are carried over.
        /// </summary>
        /// <param name="other">Segment to merge in; its successor indices are translated
        /// through <paramref name="ngramOtherRemap"/> before being merged</param>
        /// <param name="ngramOtherRemap">Maps a successor index used by <paramref name="other"/>
        /// to the index used in the combined result</param>
        /// <param name="ownNGramLength">Kept for interface compatibility. Merging is keyed on the
        /// combined index alone, so this value is not consulted</param>
        /// <returns>This segment itself when <paramref name="other"/> has no successors;
        /// otherwise a new segment built from the merged successors</returns>
        // TODO: create unit test
        public MarkovSegment Combine(MarkovSegment other,
                                     int[] ngramOtherRemap,
                                     int ownNGramLength)
        {
            // Nothing to merge in, own successors are returned untouched
            if (other.successors.Length == 0)
            {
                return(this);
            }

            // An empty own segment deliberately takes the general path below: other's successor
            // indices still have to be translated through ngramOtherRemap, so other cannot simply
            // be returned as-is.

            int capacity = successors.Length + other.successors.Length;

            // Accumulated successor per combined index. Keying on the combined index (rather than
            // on a position inside a list that keeps being re-sorted) keeps lookups valid no
            // matter how many weights get merged.
            var mergedByIndex = new Dictionary <int, NGramSuccessor>(capacity);

            // First-seen order of the combined indices, so the result is built deterministically
            var indexOrder = new List <int>(capacity);

            // Adds a weight contribution for the given combined index. Always stores a fresh
            // NGramSuccessor, so entries of the source segments are never mutated.
            void Accumulate(int combinedIndex, NGramSuccessor contribution)
            {
                if (mergedByIndex.TryGetValue(combinedIndex, out NGramSuccessor existing))
                {
                    mergedByIndex[combinedIndex] = new NGramSuccessor(combinedIndex, existing.weight + contribution.weight);
                }
                else
                {
                    mergedByIndex[combinedIndex] = new NGramSuccessor(combinedIndex, contribution.weight);
                    indexOrder.Add(combinedIndex);
                }
            }

            // Own successors already use combined indices
            foreach (NGramSuccessor successor in successors)
            {
                Accumulate(successor.successorIndex, successor);
            }

            // Other's successors are translated to combined indices first
            foreach (NGramSuccessor otherSuccessor in other.successors)
            {
                Accumulate(ngramOtherRemap[otherSuccessor.successorIndex], otherSuccessor);
            }

            // Build the final list in sorted position, now that every weight is final
            // TODO: consider switching to BST structure (likely SortedSet) to prevent O(n) of list insert???
            var reverseComparer = new NGramSuccessor.ReverseComparer();
            List <NGramSuccessor> combinedSuccessors = new List <NGramSuccessor>(indexOrder.Count);

            foreach (int combinedIndex in indexOrder)
            {
                combinedSuccessors.SortAdd(mergedByIndex[combinedIndex], reverseComparer);
            }

            return(new MarkovSegment(combinedSuccessors));
        }
Пример #3
0
        /// <summary>
        /// Merges this structure with <paramref name="other"/> into a new structure.
        /// Own words, ngrams, chain links and seeds keep their indices; entries that only
        /// exist in <paramref name="other"/> are appended after them, and chain links of ngrams
        /// present in both structures are merged through the MarkovSegment combine method.
        /// </summary>
        /// <param name="other">Structure to merge with this one</param>
        /// <returns>New structure built from the combined dictionary, ngrams, chain links and seeds</returns>
        // TODO: create unit test
        public MarkovStructure Combine(MarkovStructure other)
        {
            // --- Dictionary combining

            // Combined dictionary starts as a copy of own, dictionaryMap maps word to combined index
            List <string> combinedDictionary = new List <string>(dictionary)
            {
                Capacity = dictionary.Length + other.dictionary.Length
            };
            Dictionary <string, int> dictionaryMap = new Dictionary <string, int>(dictionary.Length + other.dictionary.Length);

            // Populate dictionaryMap with own words
            for (int ownWord = 0; ownWord < dictionary.Length; ++ownWord)
            {
                dictionaryMap[dictionary[ownWord]] = ownWord;
            }

            // Go through other's dictionary, append unseen words to combined, and fill the remap
            // array that maps other's index to combined index
            // (remap[i] = j where other[i] = combined[j])
            int[] dictionaryOtherRemap = new int[other.dictionary.Length];
            for (int otherWord = 0; otherWord < other.dictionary.Length; ++otherWord)
            {
                string word = other.dictionary[otherWord];

                if (!dictionaryMap.TryGetValue(word, out int combinedWord))
                {
                    combinedWord        = combinedDictionary.Count;
                    dictionaryMap[word] = combinedWord;
                    combinedDictionary.Add(word);
                }

                dictionaryOtherRemap[otherWord] = combinedWord;
            }

            // --- NGram Combining

            // TODO: it's possible to combine ngrams and their links at the same time instead of doing more work

            // Combined ngrams start as a copy of own, ngramMap maps own ngram to its index
            List <NGram> combinedNGrams = new List <NGram>(grams)
            {
                Capacity = grams.Length + other.grams.Length
            };
            Dictionary <NGram, int> ngramMap = new Dictionary <NGram, int>(grams.Length + other.grams.Length);

            for (int ownGram = 0; ownGram < grams.Length; ++ownGram)
            {
                ngramMap[grams[ownGram]] = ownGram;
            }

            // Go through other's ngrams, populate onto combined, and populate ngram remap
            int[] ngramOtherRemap = new int[other.grams.Length];
            // TODO: consider parallelizing, would involve an add queue and a lock potentially
            for (int otherGram = 0; otherGram < other.grams.Length; ++otherGram)
            {
                // Translate ngram using dictionary remap (-1 entries are passed through unchanged)
                var   translated = other.grams[otherGram].gram.Select((e) => (e == -1) ? -1 : dictionaryOtherRemap[e]);
                NGram remapped   = new NGram(translated);

                if (ngramMap.TryGetValue(remapped, out int existingIndex))
                {
                    // Translated ngram already exists in own, remap points to it
                    ngramOtherRemap[otherGram] = existingIndex;
                }
                else
                {
                    // Translated ngram is unique to other, add it to the end, remap points to it
                    ngramOtherRemap[otherGram] = combinedNGrams.Count;
                    combinedNGrams.Add(remapped);
                }
            }

            // --- Chain links combining

            //	Ngrams and links are associated by index despite living in separate arrays
            //	(i.e. ngram[0] corresponds with links[0])
            //	A link is unique to other when its ngram remap index >= own link count
            //	Links which exist on both sides are combined with the MarkovSegment combine method

            MarkovSegment[] combinedLinks = new MarkovSegment[combinedNGrams.Count];

            // Own links keep their indices. Only chainLinks.Length entries exist to copy;
            // the tail of combinedLinks belongs to ngrams unique to other and is filled below.
            Array.Copy(chainLinks, combinedLinks, chainLinks.Length);

            // Place or merge each of other's links
            // TODO: make parallel when done testing
            for (int index = 0; index < other.chainLinks.Length; ++index)
            {
                MarkovSegment otherSegment = other.chainLinks[index];
                int           remap        = ngramOtherRemap[index];

                if (remap >= chainLinks.Length)
                {
                    // Unique link needs to be associated with its remap spot
                    // NOTE(review): otherSegment's successor indices appear to still be expressed
                    // in other's ngram index space here; they likely need translating through
                    // ngramOtherRemap as well. No remap-only operation on MarkovSegment is visible
                    // from this method, so the segment is stored as-is. Confirm and fix.
                    combinedLinks[remap] = otherSegment;
                }
                else
                {
                    // Shared ngram: combine the segments and replace own link
                    MarkovSegment ownSegment = chainLinks[remap];
                    combinedLinks[remap] = ownSegment.Combine(otherSegment, ngramOtherRemap, grams.Length);
                }
            }

            // TODO: remove when done testing
            if (combinedLinks.Contains(null))
            {
                Console.WriteLine("yeah crazy");
            }

            // --- Seed combining

            //	Seeds are ngram indices, so other's seeds are run through the ngram remap.
            //	A remapped seed is appended only when it is not already a seed of the result,
            //	which covers both ngrams unique to other and shared ngrams that were not
            //	seeds in own.

            List <int> combinedSeeds = new List <int>(seeds)
            {
                Capacity = seeds.Length + other.seeds.Length
            };
            HashSet <int> knownSeeds = new HashSet <int>(seeds);

            foreach (int otherSeed in other.seeds)
            {
                int remappedSeed = ngramOtherRemap[otherSeed];

                if (knownSeeds.Add(remappedSeed))
                {
                    combinedSeeds.Add(remappedSeed);
                }
            }

            // Put it all together
            return(new MarkovStructure(combinedDictionary.ToArray(),
                                       combinedNGrams.ToArray(),
                                       combinedLinks,
                                       combinedSeeds.ToArray()));
        }