/// <summary>
/// Sample entry point: loads the Chunking dataset, encodes its words and tags
/// as integer labels, and hands the encoded training/testing sequences to the
/// Markov model example.
/// </summary>
static void Main(string[] args)
{
    // The previous sections showed how to perform different kinds of classification
    // tasks: with only two classes (binary), with more than two classes where each
    // instance belongs to exactly one class (multi-class), and with more than two
    // classes where an instance can belong to many classes at once (multi-label).

    // All of those classifiers receive one instance and produce one class assignment
    // for it (to one class or to several at the same time). Some problems, however,
    // require us to tell not only which classes an instance belongs to, but also how
    // those classes relate to each other.

    // For example, consider classifying audio recordings of spoken sentences according
    // to the words they contain. Since a sentence can contain multiple words, we can
    // define one class label per possible word (restricting ourselves to words found
    // in an English dictionary) and cast this as a multi-label decision problem that
    // the classifiers from the last section could solve.

    // Now suppose we are interested not only in which words a sentence contains, but
    // also in _which order_ they appear. A simple class assignment, or a fixed-length
    // vector indicating which words are present, is no longer enough. The classifier
    // has to output a _structure_ that shows how the classes relate to each other:
    // a sequence (a chain, or list of class assignments where labels can repeat),
    // a tree, a graph, or an even more complex data structure.

    // Markov models are among the simplest classifiers for structured classification.
    // Given a sequence of observations (like words), they learn to classify each word
    // in a sentence by using information about the _order_ in which words typically
    // appear. For a single instance (the sequence of words) they produce a list of
    // class labels (a sequence of assignments) indicating which word is which.

    // Start by loading the Chunking dataset for Part of Speech tagging:
    // http://www.cnts.ua.ac.be/conll2000/chunking/
    Chunking dataset = new Chunking();

    // Learn a mapping from each word to an integer class label:
    var wordCodebook = new Codification().Learn(dataset.Words);

    // Learn a mapping from each tag to an integer class label:
    var tagCodebook = new Codification().Learn(dataset.Tags);

    // Encode the training and testing word sequences as integers:
    int[][] trainWords = wordCodebook.Transform(dataset.Training.Item1);
    int[][] testWords = wordCodebook.Transform(dataset.Testing.Item1);

    // Encode the training and testing tag sequences as integers:
    int[][] trainTags = tagCodebook.Transform(dataset.Training.Item2);
    int[][] testTags = tagCodebook.Transform(dataset.Testing.Item2);

    // Learn using a Markov model
    hmm(trainWords, trainTags, testWords, testTags);

    // Learn using a Conditional Random Field
    // crf(trainWords, trainTags, testWords, testTags);
}
/// <summary>
/// Opens the file at <paramref name="path"/> for reading, chunks it with the
/// configured chunking parameters, and adds every resulting signature to
/// <c>signatureBag</c>.
/// </summary>
/// <param name="path">Path of the file to chunk.</param>
private void ScanFile(string path)
{
    // The original wrapped this in a try/finally whose finally block was empty;
    // the using statement already guarantees the stream is disposed.
    using (FileStream input = File.OpenRead(path))
    {
        foreach (RdcSignature signature in Chunking.Chunk(input, _chunkBits, _minChunk, _maxChunk, _window))
        {
            signatureBag.Add(signature);
        }
    }
}
/// <summary>
/// Trains a Markov model on the Chunking dataset with maximum likelihood
/// learning and checks the tagging accuracy on the testing split against a
/// recorded reference value.
/// </summary>
public void chunking_dataset_markov()
{
    // NOTE(review): the temp path is presumably where the dataset gets cached - confirm.
    Chunking dataset = new Chunking(path: Path.GetTempPath());

    // Build the word -> integer and tag -> integer codebooks:
    var wordCodebook = new Codification().Learn(dataset.Words);
    var tagCodebook = new Codification().Learn(dataset.Tags);

    // Encode the word sequences (inputs) of both splits:
    int[][] trainWords = wordCodebook.Transform(dataset.Training.Item1);
    int[][] testWords = wordCodebook.Transform(dataset.Testing.Item1);

    // Encode the tag sequences (expected outputs) of both splits:
    int[][] trainTags = tagCodebook.Transform(dataset.Training.Item2);
    int[][] testTags = tagCodebook.Transform(dataset.Testing.Item2);

    // Configure the maximum likelihood teacher:
    var teacher = new MaximumLikelihoodLearning();
    teacher.UseLaplaceRule = true;
    teacher.UseWeights = true;

    // Fit the Markov model on the training sequences:
    var model = teacher.Learn(trainWords, trainTags);

    // Tag the unseen testing sequences:
    int[][] predictedTags = model.Decide(testWords);

    // Compare flattened predictions against flattened ground truth:
    var confusion = new GeneralConfusionMatrix(
        predicted: predictedTags.Concatenate(),
        expected: testTags.Concatenate());

    double accuracy = confusion.Accuracy;

    // The recorded reference accuracy differs between the NET35 build and the others.
#if NET35
    Assert.AreEqual(0.51725520822339954d, accuracy, 1e-10);
#else
    Assert.AreEqual(0.43987588914452158, accuracy, 1e-10);
#endif
}
/// <summary>
/// Trains a Conditional Random Field on the Chunking dataset with
/// quasi-Newton learning and checks the tagging accuracy on the testing split
/// against a recorded reference value.
/// </summary>
public void chunking_dataset_crf()
{
    // NOTE(review): the temp path is presumably where the dataset gets cached - confirm.
    Chunking dataset = new Chunking(path: Path.GetTempPath());

    // Build the word -> integer and tag -> integer codebooks:
    var wordCodebook = new Codification().Learn(dataset.Words);
    var tagCodebook = new Codification().Learn(dataset.Tags);

    // Encode the word sequences (inputs) of both splits:
    int[][] trainWords = wordCodebook.Transform(dataset.Training.Item1);
    int[][] testWords = wordCodebook.Transform(dataset.Testing.Item1);

    // Encode the tag sequences (expected outputs) of both splits:
    int[][] trainTags = tagCodebook.Transform(dataset.Training.Item2);
    int[][] testTags = tagCodebook.Transform(dataset.Testing.Item2);

    // Configure the CRF teacher: one state per tag, one symbol per word.
    var teacher = new QuasiNewtonLearning<int>();
    teacher.Function = new MarkovDiscreteFunction(
        states: dataset.Tags.Length,
        symbols: dataset.Words.Length);

    // Fit the Conditional Random Field on the training sequences:
    ConditionalRandomField<int> model = teacher.Learn(trainWords, trainTags);

    // Tag the unseen testing sequences:
    int[][] predictedTags = model.Decide(testWords);

    // Compare flattened predictions against flattened ground truth:
    var confusion = new ConfusionMatrix(
        predicted: predictedTags.Concatenate(),
        expected: testTags.Concatenate());

    double accuracy = confusion.Accuracy;

    Assert.AreEqual(0.99983114169322662, accuracy, 1e-10);
}
/// <summary>
/// Chunks a seed file and a source file, writes each signature list to a
/// ".sigs" file next to its input, and prints which source chunks are already
/// present in the seed and which would still have to be transferred.
/// </summary>
/// <param name="seedFile">Path of the file whose chunks are treated as already available.</param>
/// <param name="sourceFile">Path of the file whose chunks are looked up in the seed.</param>
public void TestFiles(string seedFile, string sourceFile)
{
    int chunkBits = 13;
    int minChunk = 512;
    int maxChunk = UInt16.MaxValue;
    int window = 100;

    List<RdcSignature> seedSigs = ChunkAndSaveSignatures(seedFile, chunkBits, minChunk, maxChunk, window);
    List<RdcSignature> sourceSigs = ChunkAndSaveSignatures(sourceFile, chunkBits, minChunk, maxChunk, window);

    RdcSignatureComparer comparer = new RdcSignatureComparer();

    Stopwatch stopwatch = Stopwatch.StartNew();

    // Bucket the seed signatures by chunk length so each source chunk is only
    // compared against seed chunks of the same length.
    Dictionary<int, List<RdcSignature?>> seedLookup = new Dictionary<int, List<RdcSignature?>>();
    foreach (var sig in seedSigs)
    {
        List<RdcSignature?> bucket;
        if (!seedLookup.TryGetValue(sig.Length, out bucket))
        {
            bucket = new List<RdcSignature?>();
            seedLookup.Add(sig.Length, bucket);
        }

        bucket.Add(sig);
    }

    // For each chunk of the source file, decide where its bytes come from.
    List<RdcNeed> needs = new List<RdcNeed>();
    foreach (var mSig in sourceSigs)
    {
        // See if we already have the source chunk in the seed.
        RdcSignature? matchedSig = null;
        List<RdcSignature?> candidates;
        if (seedLookup.TryGetValue(mSig.Length, out candidates))
        {
            matchedSig = candidates.FirstOrDefault(sig => comparer.Equals(mSig, sig.Value));
        }

        if (matchedSig != null)
        {
            // We have it: reuse the matching seed chunk.
            needs.Add(new RdcNeed()
            {
                blockType = RdcNeedType.Seed,
                length = matchedSig.Value.Length,
                offset = matchedSig.Value.Offset
            });
        }
        else
        {
            // We don't: the chunk has to come from the source.
            needs.Add(new RdcNeed()
            {
                blockType = RdcNeedType.Source,
                length = mSig.Length,
                offset = mSig.Offset
            });
        }
    }

    stopwatch.Stop();

    int haveCount = needs.Count(n => n.blockType == RdcNeedType.Seed);
    int haveSize = needs.Where(n => n.blockType == RdcNeedType.Seed).Sum(n => n.length);
    int needCount = needs.Count(n => n.blockType == RdcNeedType.Source);
    int needSize = needs.Where(n => n.blockType == RdcNeedType.Source).Sum(n => n.length);
    int totalSize = needs.Sum(n => n.length);

    // Guard the ratios: Enumerable.Average throws on an empty sequence, which
    // happens when the source is empty or when every chunk was found in the
    // seed, and 0/0 would report NaN savings.
    double savings = totalSize > 0 ? ((double)haveSize) / totalSize : 0.0;
    double chunkSizeAvg = sourceSigs.Count > 0 ? sourceSigs.Average(s => s.Length) : 0.0;
    double chunkSizeMissing = needCount > 0 ? ((double)needSize) / needCount : 0.0;

    Console.WriteLine("Calculated needs in {0}ms", stopwatch.ElapsedMilliseconds);
    Console.WriteLine("Need {0} of {1} chunks, or {2:N0} of {3:N0} bytes. ({4:P2} savings, {6} chunks, {5:N0} bytes).",
        needCount, needs.Count, needSize, totalSize, savings, haveSize, haveCount);
    Console.WriteLine("Average: {0:N0} (source) / {1:N0} (missing) bytes per chunk", chunkSizeAvg, chunkSizeMissing);
}

// Chunks the file at `path`, writes its signatures to `path + ".sigs"`, prints
// timing and size figures, and returns the signatures.
private static List<RdcSignature> ChunkAndSaveSignatures(string path, int chunkBits, int minChunk, int maxChunk, int window)
{
    using (FileStream input = File.OpenRead(path))
    {
        Stopwatch timer = Stopwatch.StartNew();
        List<RdcSignature> signatures = Chunking.Chunk(input, chunkBits, minChunk, maxChunk, window);
        timer.Stop();

        long sigFileLength;

        // File.Create truncates an existing file. File.OpenWrite does not, so a
        // previous, longer ".sigs" file would keep stale bytes after the new data.
        using (FileStream output = File.Create(path + ".sigs"))
        {
            sigFileLength = SignatureSerializer.Serialize(output, signatures);
        }

        Console.WriteLine("{4:D4} chunks in {0}ms. {1:D6} / {2:D10} ({3:p})",
            timer.ElapsedMilliseconds,
            sigFileLength,
            input.Length,
            ((double)sigFileLength) / input.Length,
            signatures.Count);

        return signatures;
    }
}
/// <summary>
/// Applies to one disease only. Runs the chunker over the lower-cased text of
/// every publication and counts, per symptom, how many recognized chunks match
/// the symptom's name or synonyms. The count is stored in the symptom's
/// <c>TFType.RawCount</c> term frequency.
/// </summary>
/// <param name="publications">Publications of the disease to scan.</param>
/// <param name="disease">Disease the resulting data is attached to.</param>
/// <returns>The disease data holding the symptoms found and their raw counts.</returns>
public DiseaseData GetPredictionDataCountFromPublicationsOfOneDisease(List<Publication> publications, Disease disease)
{
    RelatedEntities symptomContainer = new RelatedEntities(type.Symptom, new List<RelatedEntity>());
    DiseaseData PredictionData = new DiseaseData(disease, symptomContainer);
    List<RelatedEntity> relatedEntities = PredictionData.RelatedEntities.RelatedEntitiesList;

    foreach (Publication publication in publications)
    {
        // Concatenate title, abstract and full text into the shared builder.
        stringBuilder.Clear();
        stringBuilder.Append(publication.title)
                     .Append(" ")
                     .Append(publication.abstractText)
                     .Append(" ")
                     .Append(publication.fullText);

        // Text preprocessing
        string text = stringBuilder.ToString().ToLower();

        // Named entity recognition
        Chunking chunking = chunker.chunk(text);
        CharSequence chars = chunking.charSequence();
        Set chunkSet = chunking.chunkSet();

        for (Iterator it = chunkSet.iterator(); it.hasNext();)
        {
            Chunk chunk = (Chunk)it.next();
            int start = chunk.start();
            int end = chunk.end();
            string term = chars.subSequence(start, end).toString();

            int index = relatedEntities.FindIndex(
                symptom => symptom.Name.Equals(term) || symptom.Synonyms.IndexOf(term) != -1);

            if (index != -1)
            {
                // Already collected: bump its raw count.
                var rawCount = relatedEntities[index].TermFrequencies
                    .FirstOrDefault(tf => tf.TFType == TFType.RawCount);
                rawCount.Value++;
                continue;
            }

            // Not collected yet: look the term up in the phenotype list.
            Symptom knownSymptom = symptomsList
                .FirstOrDefault(x => x.Name.Equals(term) || x.Synonyms.IndexOf(term) != -1);

            // Only terms that map to a real symptom are added.
            if (knownSymptom == null)
            {
                continue;
            }

            RelatedEntity newEntity = new RelatedEntity(
                type.Symptom,
                knownSymptom.Name,
                1.0,
                knownSymptom.Synonyms);

            var initialCount = newEntity.TermFrequencies
                .FirstOrDefault(tf => tf.TFType == TFType.RawCount);
            initialCount.Value = 1.0;

            relatedEntities.Add(newEntity);
        }
    }

    // Sorting by raw count and keeping only the configured maximum number of
    // symptoms (config MaxNumberSymptoms) is intentionally not applied here.
    return PredictionData;
}
/// <summary>
/// Applies to one disease only. Runs the HMM chunker over the lower-cased text
/// of every publication, accumulates a weight per recognized symptom,
/// normalizes the weights to the 0-100 range, and keeps the highest-weighted
/// symptoms up to the configured maximum.
/// </summary>
/// <param name="publications">Publications of the disease to scan.</param>
/// <param name="disease">Disease the resulting data is attached to.</param>
/// <returns>The disease data holding the retained symptoms, sorted by descending weight.</returns>
public DiseaseData GetPredictionDataFromPublicationsOfOneDisease(List<Publication> publications, Disease disease)
{
    DiseaseData PredictionData = new DiseaseData(disease,
        new RelatedEntities(
            type.Symptom,
            new List<RelatedEntity>()
        )
    );

    List<RelatedEntity> relatedEntities = PredictionData.RelatedEntities.RelatedEntitiesList;

    foreach (Publication publication in publications)
    {
        string text = publication.title + " " + publication.abstractText + " " + publication.fullText;

        //Text preprocessing
        text = text.ToLower();

        //NAMED ENTITY RECOGNITION
        Chunking chunking = chunkerHMM.chunk(text);
        CharSequence cs = chunking.charSequence();
        Set chunkSet = chunking.chunkSet();
        Iterator iterator = chunkSet.iterator();

        while (iterator.hasNext())
        {
            Chunk chunk = (Chunk)iterator.next();
            int start = chunk.start();
            int end = chunk.end();
            string str = cs.subSequence(start, end).toString();

            int index = relatedEntities.FindIndex(symptom => symptom.Name.Equals(str) || symptom.Synonyms.IndexOf(str) != -1);
            if (index != -1)
            {
                relatedEntities[index].Weight++;
            }
            else
            {
                //Find infos from phenotypes lists
                Symptom symptomFromPhetotypes = symptomsList.Where(x => x.Name.Equals(str) || x.Synonyms.IndexOf(str) != -1).FirstOrDefault();

                //Add the real Symptom if it exists. FirstOrDefault returns null
                //when the chunk matches no known symptom, which would otherwise
                //throw a NullReferenceException below.
                if (symptomFromPhetotypes != null)
                {
                    relatedEntities.Add(
                        new RelatedEntity(
                            type.Symptom,
                            symptomFromPhetotypes.Name,
                            1.0,
                            symptomFromPhetotypes.Synonyms
                        )
                    );
                }
            }
        }
    }

    //Symptom Weight Normalization from 0 to 100
    if (relatedEntities.Count > 0)
    {
        //Find Min and Max once, BEFORE any weight is rewritten: recomputing
        //them inside the loop would mix already-normalized weights with raw
        //ones and skew every later result.
        double max = relatedEntities.Max(x => x.Weight);
        double min = relatedEntities.Min(x => x.Weight);

        for (int i = 0; i < relatedEntities.Count; i++)
        {
            if (max == min)
            {
                //All weights are equal (e.g. a single symptom): only clamp to 100.
                if (relatedEntities[i].Weight > 100.0)
                {
                    relatedEntities[i].Weight = 100.0;
                }
            }
            else
            {
                relatedEntities[i].Weight = 100 * (relatedEntities[i].Weight - min) / (max - min);
            }
        }
    }

    //Sort by descending weight and take only the best symptoms (see config file)
    PredictionData.RelatedEntities.RelatedEntitiesList =
        PredictionData.RelatedEntities.RelatedEntitiesList
        .OrderByDescending(x => x.Weight)
        .Take(ConfigurationManager.Instance.config.MaxNumberSymptoms)
        .ToList();

    return PredictionData;
}