Example #1
0
        /// <summary>
        /// Program entry point: builds the ingest options (input CSV, regex clean-up filters, gram size),
        /// runs a <c>MarkovPipe</c> with them and reads the pipe's result.
        /// </summary>
        static void Main()
        {
            // Each entry pairs a regex pattern with its replacement text.
            var filters = new[]
            {
                Tuple.Create(@"\b?@\S+\s*", " "),       // @person mentions
                Tuple.Create(@"\b?https?.*\s*", " "),   // URLs
                Tuple.Create(@"`+[^`]*`+", " "),        // code blocks
                Tuple.Create(@"[\*]+", " "),            // italics / bold markers
                Tuple.Create(@"\b?\W+\s*", " "),        // whole non-words (e.g. "->")

                // THESE ALWAYS GO LAST
                Tuple.Create(@"^\s+", ""),              // leading whitespace
                Tuple.Create(@"\s+$", ""),              // trailing whitespace
                Tuple.Create(@"\s{2,}", " ")            // runs of whitespace (also handles newlines)
            };

            var opts = new Ingesting.IngestOptions
            {
                infileCSV    = "m.csv",
                regexFilters = filters,
                gramSize     = 2
            };

            var pipe = new Ingesting.MarkovPipe(opts);
            pipe.Run();
            var result = pipe.Result;

            // The #else branch below is compiled out; it is kept as a scratch pad for Combine experiments.
#if true
#else
            Structs.MarkovStructure A = Structs.MarkovStructure.ReadFile("A.markov");
            Structs.MarkovStructure H = Structs.MarkovStructure.ReadFile("H.markov");
            var C = A.Combine(H);
            Console.WriteLine(H.ToString());
#endif

            /* Unit test planning for the MarkovStructure combine functions
             *	Unit tests for MarkovStructure Combine
             *		Combine a structure with itself (small test, real test)
             *		Combine completely different structures (small test; unclear whether a real test is possible)
             *		Combine structures with SOME overlap (small, large)
             *	Unit test for MarkovSegment Combine
             *
             *	Tests:
             *		A: small same structure -- OK
             *		B: big same structure
             *		C: small different
             *		~~D: big different?~~
             *		E: small overlap
             *		F: big overlap
             *		G: any with empty (if possible)
             *		H: based on A, rearrange dictionary and update structures to point to regular thing -- OK
             *		I: final test, when the ingesting pipeline is reworked to handle multiple users, combine them all together
             */

            /*	TODO: Plan out fully fledged options and implement them
             *		-	Maybe ingest unique regex filters, possibly from some sort of input CSV file
             *		-	Maybe split runtime functionality into "ingest" and "create from file"
             *		-		Maybe ingest has a flag to create from what was just ingested
             *		-	For creating, read from stdin or from a passed "-input" parameter, sed-style
             */
        }
Example #2
0
        /// <summary>
        /// Builds a new <see cref="MarkovStructure"/> that merges this structure with <paramref name="other"/>.
        /// </summary>
        /// <remarks>
        /// This structure's dictionary words, n-grams, chain links and seeds keep their existing indices in
        /// the result. Words and n-grams that only exist in <paramref name="other"/> are appended after them,
        /// and <paramref name="other"/>'s indices are translated through remap tables built along the way.
        /// </remarks>
        /// <param name="other">The structure whose contents are merged with this one.</param>
        /// <returns>A new structure built from the combined dictionary, n-grams, chain links and seeds.</returns>
        public MarkovStructure Combine(MarkovStructure other)
        {
            // TODO: create unit test

            // --- Dictionary combining

            // Combined dictionary starts as a copy of our own, so our word indices stay valid
            List <string> combinedDictionary = new List <string>(dictionary)
            {
                Capacity = dictionary.Length + other.dictionary.Length
            };
            Dictionary <string, int> dictionaryMap = new Dictionary <string, int>(dictionary.Length + other.dictionary.Length);

            // Populate dictionaryMap (word -> combined index) with our own words
            for (int index = 0; index < dictionary.Length; ++index)
            {
                dictionaryMap[dictionary[index]] = index;
            }

            // Single pass over other's dictionary: append unseen words to the combined dictionary and
            // record the remap (remap[i] = j where other[i] = combined[j])
            int[] dictionaryOtherRemap = new int[other.dictionary.Length];
            for (int index = 0; index < other.dictionary.Length; ++index)
            {
                string othersCurrentWord = other.dictionary[index];

                if (!dictionaryMap.TryGetValue(othersCurrentWord, out int combinedWordIndex))
                {
                    combinedWordIndex = combinedDictionary.Count;
                    dictionaryMap[othersCurrentWord] = combinedWordIndex;
                    combinedDictionary.Add(othersCurrentWord);
                }

                dictionaryOtherRemap[index] = combinedWordIndex;
            }

            // --- NGram Combining

            // TODO: it's possible to combine ngrams and their links at the same time instead of doing more work

            // Combined ngrams start as a copy of our own, so our ngram indices stay valid
            List <NGram> combinedNGrams = new List <NGram>(grams)
            {
                Capacity = grams.Length + other.grams.Length
            };
            Dictionary <NGram, int> ngramMap = new Dictionary <NGram, int>(grams.Length + other.grams.Length);

            // Populate gram map (ngram -> combined index) with own grams
            for (int index = 0; index < grams.Length; ++index)
            {
                ngramMap[grams[index]] = index;
            }

            // Go through other's ngrams, populate onto combined, and populate ngram remap
            int[] ngramOtherRemap = new int[other.grams.Length];
            // TODO: consider parallelizing, would involve an add queue and a lock potentially
            for (int index = 0; index < other.grams.Length; ++index)
            {
                // Translate ngram into combined dictionary indices; -1 entries are passed through untouched
                var   translated = other.grams[index].gram.Select((e) => (e == -1) ? -1 : dictionaryOtherRemap[e]);
                NGram remapped   = new NGram(translated);

                if (!ngramMap.TryGetValue(remapped, out int combinedGramIndex))
                {
                    // Translated ngram is not one of ours: add it to the end, remap points to it
                    combinedGramIndex = combinedNGrams.Count;
                    combinedNGrams.Add(remapped);
                }

                ngramOtherRemap[index] = combinedGramIndex;
            }

            // --- Chain links combining

            //	Can tell whether one of other's links is unique by testing whether its ngram remap index >= our own length
            //	Remember that ngrams and the links are associated together despite being in separate arrays (i.e. ngram[0] corresponds with links[0])
            //	For those which need to be combined, use MarkovSegment combine method

            MarkovSegment[] combinedLinks = new MarkovSegment[combinedNGrams.Count];

            // Populate combinedLinks with own links.
            // The bound must be chainLinks.Length: combinedLinks is longer than chainLinks whenever other
            // contributed new ngrams, so iterating to combinedLinks.Length would read past the end of chainLinks.
            Parallel.For(0, chainLinks.Length, (index) => {
                combinedLinks[index] = chainLinks[index];
            });

            // Populate combinedLinks with other's links
            // TODO: make parallel when done testing
            for (int index = 0; index < other.chainLinks.Length; ++index)
            {
                MarkovSegment otherSegment = other.chainLinks[index];
                int           remap        = ngramOtherRemap[index];

                if (remap >= chainLinks.Length)
                {
                    // Unique link needs to be associated with its remap spot.
                    // NOTE(review): the segment is stored as-is; if MarkovSegment holds indices into other's
                    // ngram array they would still need translating through ngramOtherRemap -- confirm.
                    combinedLinks[remap] = otherSegment;
                }
                else
                {
                    // Shared ngram: merge our segment with other's and store the merged one in its place
                    combinedLinks[remap] = chainLinks[remap].Combine(otherSegment, ngramOtherRemap, grams.Length);
                }
            }

            // TODO: remove when done testing
            if (combinedLinks.Contains(null))
            {
                Console.WriteLine("yeah crazy");
            }

            // --- Seed combining

            //	Seeds are ngram indices, so other's seeds are run through the ngram remap first.
            //	A remapped seed is appended unless it is already a seed; this covers both brand-new ngrams
            //	and shared ngrams that only other used as a seed.

            List <int> combinedSeeds = new List <int>(seeds)
            {
                Capacity = seeds.Length + other.seeds.Length
            };
            HashSet <int> knownSeeds = new HashSet <int>(seeds);

            foreach (int otherSeed in other.seeds)
            {
                int remappedSeed = ngramOtherRemap[otherSeed];

                // HashSet.Add returns false when the value is already present
                if (knownSeeds.Add(remappedSeed))
                {
                    combinedSeeds.Add(remappedSeed);
                }
            }

            // Put it all together
            return(new MarkovStructure(combinedDictionary.ToArray(),
                                       combinedNGrams.ToArray(),
                                       combinedLinks,
                                       combinedSeeds.ToArray()));
        }