Exemplo n.º 1
0
        static void Main(string[] args)
        {
            // The previous sections covered several kinds of classification tasks: binary
            // (exactly two classes), multi-class (more than two classes, each instance belonging
            // to exactly one of them) and multi-label (more than two classes, where an instance
            // can belong to many classes simultaneously).

            // All of those classifiers receive one instance and produce one class assignment for
            // it (to a single class or to several classes at once). Some problems, however, require
            // us to tell not only which classes an instance might belong to, but also how those
            // classes relate to each other.

            // As an example, consider classifying audio recordings of spoken sentences according
            // to the words they contain. Since a sentence can contain multiple words, we can define
            // one class label per possible word (restricting ourselves to words found in an English
            // dictionary) and cast the task as a multi-label decision problem, which the classifiers
            // from the last section could solve.

            // Now suppose we are interested not only in which words a spoken sentence contains, but
            // also in _which order_ those words appear. The classifier then has to output more than
            // a simple class assignment or a fixed-length vector of word indicators: it has to output
            // a _structure_ that shows how the classes relate to each other. That structure can be a
            // sequence (a chain, or list of class assignments in which labels may repeat), a tree,
            // a graph, or an even more complex data structure.

            // Markov models are among the simplest classifiers capable of structured classification.
            // Given a sequence of observations (such as words), they learn to classify each word in
            // a sentence by leveraging information about the _order_ in which words typically appear.
            // For a single training instance (the sequence of words) they produce a list of class
            // labels (a sequence of assignments) indicating which word is which.

            // Load the Chunking dataset for Part of Speech tagging:
            var corpus = new Chunking(); // http://www.cnts.ua.ac.be/conll2000/chunking/

            // Build one codebook for the words and another one for the tags; each codebook
            // maps its symbols to integer class labels:
            var wordCodebook = new Codification().Learn(corpus.Words);
            var tagCodebook = new Codification().Learn(corpus.Tags);

            // Encode the training split (words and tags) as integer labels:
            int[][] trainWords = wordCodebook.Transform(corpus.Training.Item1);
            int[][] trainTags = tagCodebook.Transform(corpus.Training.Item2);

            // Encode the testing split (words and tags) as integer labels:
            int[][] testWords = wordCodebook.Transform(corpus.Testing.Item1);
            int[][] testTags = tagCodebook.Transform(corpus.Testing.Item2);

            // Learn using a Markov model
            hmm(trainWords, trainTags, testWords, testTags);

            // Learn using a Conditional Random Field
            // crf(trainWords, trainTags, testWords, testTags);
        }
Exemplo n.º 2
0
 /// <summary>
 /// Opens the file at <paramref name="path"/> for reading, chunks it with the
 /// configured chunking parameters and adds every resulting signature to
 /// <c>signatureBag</c>.
 /// </summary>
 /// <param name="path">Path of the file to scan.</param>
 private void ScanFile(string path)
 {
     // The former try/finally wrapper had an empty finally block and therefore did
     // nothing; the using statement alone guarantees the stream is disposed, and
     // exceptions propagate to the caller exactly as before.
     using (FileStream fs = File.OpenRead(path))
     {
         foreach (RdcSignature sig in Chunking.Chunk(fs, _chunkBits, _minChunk, _maxChunk, _window))
         {
             signatureBag.Add(sig);
         }
     }
 }
Exemplo n.º 3
0
        /// <summary>
        /// Trains a Markov model on the Chunking dataset with maximum likelihood
        /// learning and checks the tagging accuracy obtained on the testing split.
        /// </summary>
        public void chunking_dataset_markov()
        {
            Chunking dataset = new Chunking(path: Path.GetTempPath());

            // Codebooks translating words and tags into integer class labels
            var wordCodebook = new Codification().Learn(dataset.Words);
            var tagCodebook = new Codification().Learn(dataset.Tags);

            // Integer-encoded training split (observations and expected tags)
            int[][] trainWords = wordCodebook.Transform(dataset.Training.Item1);
            int[][] trainTags = tagCodebook.Transform(dataset.Training.Item2);

            // Integer-encoded testing split (observations and expected tags)
            int[][] testWords = wordCodebook.Transform(dataset.Testing.Item1);
            int[][] testTags = tagCodebook.Transform(dataset.Testing.Item2);

            // Configure the maximum likelihood teacher
            var learner = new MaximumLikelihoodLearning();
            learner.UseLaplaceRule = true;
            learner.UseWeights = true;

            // Fit the Markov model and tag the testing sentences with it
            var model = learner.Learn(trainWords, trainTags);
            int[][] predictedTags = model.Decide(testWords);

            // Compare the flattened predictions against the flattened expected tags
            var confusion = new GeneralConfusionMatrix(
                predicted: predictedTags.Concatenate(),
                expected: testTags.Concatenate());

            double accuracy = confusion.Accuracy;

            // The expected accuracy depends on the target framework
#if NET35
            Assert.AreEqual(0.51725520822339954d, accuracy, 1e-10);
#else
            Assert.AreEqual(0.43987588914452158, accuracy, 1e-10);
#endif
        }
Exemplo n.º 4
0
        /// <summary>
        /// Trains a Conditional Random Field on the Chunking dataset with quasi-Newton
        /// learning and checks the accuracy obtained on the testing split.
        /// </summary>
        public void chunking_dataset_crf()
        {
            Chunking dataset = new Chunking(path: Path.GetTempPath());

            // Codebooks translating words and tags into integer class labels
            var wordCodebook = new Codification().Learn(dataset.Words);
            var tagCodebook = new Codification().Learn(dataset.Tags);

            // Integer-encoded training split (observations and expected tags)
            int[][] trainWords = wordCodebook.Transform(dataset.Training.Item1);
            int[][] trainTags = tagCodebook.Transform(dataset.Training.Item2);

            // Integer-encoded testing split (observations and expected tags)
            int[][] testWords = wordCodebook.Transform(dataset.Testing.Item1);
            int[][] testTags = tagCodebook.Transform(dataset.Testing.Item2);

            // One state per tag and one symbol per word
            int tagCount = dataset.Tags.Length;
            int wordCount = dataset.Words.Length;

            // Set up the quasi-Newton teacher with a discrete Markov potential function
            var learner = new QuasiNewtonLearning<int>();
            learner.Function = new MarkovDiscreteFunction(states: tagCount, symbols: wordCount);

            // Fit the Conditional Random Field and tag the testing sentences with it
            ConditionalRandomField<int> field = learner.Learn(trainWords, trainTags);
            int[][] predictedTags = field.Decide(testWords);

            // Compare the flattened predictions against the flattened expected tags.
            // NOTE(review): this uses ConfusionMatrix while the Markov variant of this test
            // uses GeneralConfusionMatrix - confirm which one is intended for multiple tags.
            var confusion = new ConfusionMatrix(
                predicted: predictedTags.Concatenate(),
                expected: testTags.Concatenate());

            double accuracy = confusion.Accuracy;

            Assert.AreEqual(0.99983114169322662, accuracy, 1e-10);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Chunks a seed file and a source file, writes each signature list next to its
        /// file with a ".sigs" suffix, and prints timing and size statistics describing
        /// how many source chunks are already available in the seed and how many are not.
        /// </summary>
        /// <param name="seedFile">Path of the seed file whose chunks act as the local cache.</param>
        /// <param name="sourceFile">Path of the source file whose chunks are looked up in the seed.</param>
        public void TestFiles(string seedFile, string sourceFile)
        {
            List<RdcSignature> seedSigs;
            List<RdcSignature> sourceSigs;

            int chunkBits = 13;
            int minChunk  = 512;
            int maxChunk  = UInt16.MaxValue;
            int window    = 100;

            Stopwatch stopwatch = new Stopwatch();

            using (FileStream fs = File.OpenRead(seedFile))
            {
                stopwatch.Start();
                seedSigs = Chunking.Chunk(fs, chunkBits, minChunk, maxChunk, window);
                stopwatch.Stop();

                long sigFileLength;
                // FileMode.Create truncates an existing signature file. File.OpenWrite would
                // leave stale trailing bytes behind whenever the new content is shorter.
                using (FileStream fs2 = new FileStream(seedFile + ".sigs", FileMode.Create, FileAccess.Write, FileShare.None))
                {
                    sigFileLength = SignatureSerializer.Serialize(fs2, seedSigs);
                }

                Console.WriteLine("{4:D4} chunks in {0}ms.  {1:D6} / {2:D10} ({3:p})"
                                  , stopwatch.ElapsedMilliseconds, sigFileLength, fs.Length, ((double)sigFileLength) / fs.Length, seedSigs.Count);
            }
            stopwatch.Reset();

            using (FileStream fs = File.OpenRead(sourceFile))
            {
                stopwatch.Start();
                sourceSigs = Chunking.Chunk(fs, chunkBits, minChunk, maxChunk, window);
                stopwatch.Stop();

                long sigFileLength;
                // Truncate on create for the same reason as the seed signature file above.
                using (FileStream fs2 = new FileStream(sourceFile + ".sigs", FileMode.Create, FileAccess.Write, FileShare.None))
                {
                    sigFileLength = SignatureSerializer.Serialize(fs2, sourceSigs);
                }

                Console.WriteLine("{4:D4} chunks  in {0}ms.  {1:D6} / {2:D10} ({3:p})"
                                  , stopwatch.ElapsedMilliseconds, sigFileLength, fs.Length, ((double)sigFileLength) / fs.Length, sourceSigs.Count);
            }

            RdcSignatureComparer comparer = new RdcSignatureComparer();

            // Index the seed signatures by chunk length so that each source chunk is only
            // compared against seed chunks of the same length.
            stopwatch.Restart();
            Dictionary<int, List<RdcSignature?>> seedLookup = new Dictionary<int, List<RdcSignature?>>();

            foreach (var sig in seedSigs)
            {
                List<RdcSignature?> bucket;
                if (!seedLookup.TryGetValue(sig.Length, out bucket))
                {
                    bucket = new List<RdcSignature?>();
                    seedLookup.Add(sig.Length, bucket);
                }
                bucket.Add(sig);
            }

            List<RdcNeed> needs = new List<RdcNeed>();

            foreach (var mSig in sourceSigs)
            {
                // See if we have the server sig in our local cache. The candidates are stored
                // as nullable values so that "no match" comes back from FirstOrDefault as null.
                RdcSignature? matchedSig = null;
                List<RdcSignature?> candidates;
                if (seedLookup.TryGetValue(mSig.Length, out candidates))
                {
                    matchedSig = candidates.FirstOrDefault(sig => comparer.Equals(mSig, sig.Value));
                }

                if (matchedSig != null)
                {
                    // We have it: the block can be taken from the seed file.
                    needs.Add(new RdcNeed()
                    {
                        blockType = RdcNeedType.Seed, length = matchedSig.Value.Length, offset = matchedSig.Value.Offset
                    });
                }
                else
                {
                    // We don't: the block is needed from the server.
                    needs.Add(new RdcNeed()
                    {
                        blockType = RdcNeedType.Source, length = mSig.Length, offset = mSig.Offset
                    });
                }
            }
            stopwatch.Stop();

            int haveCount = needs.Count(n => n.blockType == RdcNeedType.Seed);
            int haveSize  = needs.Where(n => n.blockType == RdcNeedType.Seed).Sum(n => n.length);
            int needCount = needs.Count(n => n.blockType == RdcNeedType.Source);
            int needSize  = needs.Where(n => n.blockType == RdcNeedType.Source).Sum(n => n.length);
            int totalSize = needs.Sum(n => n.length);

            // Guard the ratios: Enumerable.Average throws on an empty sequence, which happens
            // when every source chunk was found in the seed or the source produced no chunks.
            double savings          = totalSize > 0 ? ((double)haveSize) / totalSize : 0.0;
            double chunkSizeAvg     = sourceSigs.Count > 0 ? sourceSigs.Average(s => s.Length) : 0.0;
            double chunkSizeMissing = needCount > 0
                ? needs.Where(n => n.blockType == RdcNeedType.Source).Average(s => s.length)
                : 0.0;

            Console.WriteLine("Calculated needs in {0}ms", stopwatch.ElapsedMilliseconds);
            Console.WriteLine("Need {0} of {1} chunks, or {2:N0} of {3:N0} bytes. ({4:P2} savings, {6} chunks, {5:N0} bytes).", needCount, needs.Count, needSize, totalSize, savings, haveSize, haveCount);
            Console.WriteLine("Average: {0:N0} (source) / {1:N0} (missing) bytes per chunk", chunkSizeAvg, chunkSizeMissing);
        }
Exemplo n.º 6
0
        /// <summary>
        /// Builds the symptom prediction data of a single disease by counting how often
        /// each known symptom is recognized in the text of the given publications.
        /// </summary>
        /// <param name="publications">Publications whose title, abstract and full text are scanned.</param>
        /// <param name="disease">Disease the resulting data is attached to.</param>
        /// <returns>
        /// Disease data whose related entities are the recognized symptoms, each carrying
        /// its occurrence count in the term frequency of type <c>TFType.RawCount</c>.
        /// </returns>
        public DiseaseData GetPredictionDataCountFromPublicationsOfOneDisease(List<Publication> publications, Disease disease)
        {
            DiseaseData predictionData = new DiseaseData(
                disease,
                new RelatedEntities(type.Symptom, new List<RelatedEntity>()));
            List<RelatedEntity> relatedEntities = predictionData.RelatedEntities.RelatedEntitiesList;

            foreach (Publication publication in publications)
            {
                // Concatenate title, abstract and full text into one document
                stringBuilder.Clear();
                stringBuilder.Append(publication.title);
                stringBuilder.Append(" ");
                stringBuilder.Append(publication.abstractText);
                stringBuilder.Append(" ");
                stringBuilder.Append(publication.fullText);

                //Text preprocessing
                string text = stringBuilder.ToString().ToLower();

                //NAMED ENTITY RECOGNITION
                Chunking     chunking = chunker.chunk(text);
                CharSequence cs       = chunking.charSequence();
                Iterator     iterator = chunking.chunkSet().iterator();
                while (iterator.hasNext())
                {
                    // Extract the text covered by the recognized chunk
                    Chunk  chunk = (Chunk)iterator.next();
                    string str   = cs.subSequence(chunk.start(), chunk.end()).toString();

                    RelatedEntity known = relatedEntities.Find(symptom => symptom.Name.Equals(str) || symptom.Synonyms.IndexOf(str) != -1);
                    if (known != null)
                    {
                        // Symptom already collected: one more occurrence
                        known.TermFrequencies.FirstOrDefault(tf => tf.TFType == TFType.RawCount).Value++;
                    }
                    else
                    {
                        //Find infos from phenotypes lists
                        Symptom symptomFromPhenotypes = symptomsList.FirstOrDefault(x => x.Name.Equals(str) || x.Synonyms.IndexOf(str) != -1);

                        //Add the real Symptom if it exists
                        if (symptomFromPhenotypes != null)
                        {
                            RelatedEntity myRealEntity = new RelatedEntity(
                                type.Symptom,
                                symptomFromPhenotypes.Name,
                                1.0,
                                symptomFromPhenotypes.Synonyms
                                );
                            // First occurrence of this symptom
                            myRealEntity.TermFrequencies.FirstOrDefault(tf => tf.TFType == TFType.RawCount).Value = 1.0;
                            relatedEntities.Add(myRealEntity);
                        }
                    }
                }
            }

            /*
             * Disabled step, kept for reference:
             * //Sort and Take only a the best symptoms (see config file)
             * PredictionData.RelatedEntities.RelatedEntitiesList =
             *  PredictionData.RelatedEntities.RelatedEntitiesList
             *  .OrderByDescending(x => x.TermFrequencies.Where(tf => tf.TFType == TFType.RawCount).FirstOrDefault().Value)
             *  .Take(ConfigurationManager.Instance.config.MaxNumberSymptoms)
             *  .ToList();
             */

            return predictionData;
        }
Exemplo n.º 7
0
        /// <summary>
        /// Builds the symptom prediction data of a single disease: counts how often each
        /// known symptom is recognized in the given publications, rescales the counts to
        /// the range 0 to 100 and keeps only the highest-weighted symptoms.
        /// </summary>
        /// <param name="publications">Publications whose title, abstract and full text are scanned.</param>
        /// <param name="disease">Disease the resulting data is attached to.</param>
        /// <returns>
        /// Disease data whose related entities are the recognized symptoms ordered by
        /// descending weight and limited to the configured <c>MaxNumberSymptoms</c>.
        /// </returns>
        public DiseaseData GetPredictionDataFromPublicationsOfOneDisease(List<Publication> publications, Disease disease)
        {
            DiseaseData predictionData = new DiseaseData(
                disease,
                new RelatedEntities(type.Symptom, new List<RelatedEntity>()));
            List<RelatedEntity> relatedEntities = predictionData.RelatedEntities.RelatedEntitiesList;

            foreach (Publication publication in publications)
            {
                string text = publication.title + " " + publication.abstractText + " " + publication.fullText;

                //Text preprocessing
                text = text.ToLower();

                //NAMED ENTITY RECOGNITION
                Chunking     chunking = chunkerHMM.chunk(text);
                CharSequence cs       = chunking.charSequence();
                Iterator     iterator = chunking.chunkSet().iterator();
                while (iterator.hasNext())
                {
                    // Extract the text covered by the recognized chunk
                    Chunk  chunk = (Chunk)iterator.next();
                    string str   = cs.subSequence(chunk.start(), chunk.end()).toString();

                    int index = relatedEntities.FindIndex(symptom => symptom.Name.Equals(str) || symptom.Synonyms.IndexOf(str) != -1);
                    if (index != -1)
                    {
                        // Symptom already collected: one more occurrence
                        relatedEntities[index].Weight++;
                        continue;
                    }

                    //Find infos from phenotypes lists
                    Symptom symptomFromPhenotypes = symptomsList.FirstOrDefault(x => x.Name.Equals(str) || x.Synonyms.IndexOf(str) != -1);

                    // Add the real Symptom only if it exists; a chunk that matches no known
                    // symptom used to cause a NullReferenceException here.
                    if (symptomFromPhenotypes != null)
                    {
                        relatedEntities.Add(
                            new RelatedEntity(
                                type.Symptom,
                                symptomFromPhenotypes.Name,
                                1.0,
                                symptomFromPhenotypes.Synonyms
                                )
                            );
                    }
                }
            }

            //Symptom Weight Normalization from 0 to 100
            if (relatedEntities.Count > 0)
            {
                // Min and max must be taken once, from the raw weights. Recomputing them
                // inside the loop mixed already-normalized values with raw ones and
                // produced wrong results.
                double max = relatedEntities.Max(x => x.Weight);
                double min = relatedEntities.Min(x => x.Weight);

                foreach (RelatedEntity entity in relatedEntities)
                {
                    if (max == min)
                    {
                        // All weights are equal (e.g. a single symptom): only cap at 100
                        if (entity.Weight > 100.0)
                        {
                            entity.Weight = 100.0;
                        }
                    }
                    else
                    {
                        entity.Weight = 100 * (entity.Weight - min) / (max - min);
                    }
                }
            }

            //Sort by descending weight and take only the best symptoms (see config file)
            predictionData.RelatedEntities.RelatedEntitiesList =
                predictionData.RelatedEntities.RelatedEntitiesList
                .OrderByDescending(x => x.Weight)
                .Take(ConfigurationManager.Instance.config.MaxNumberSymptoms)
                .ToList();

            return predictionData;
        }