コード例 #1
0
        /// <summary>
        /// Reads column-formatted (CoNLL-style) input and splits it into sentences, returning one
        /// <see cref="DataInstance"/> per non-empty sentence.
        /// </summary>
        /// <remarks>
        /// A sentence ends at a token whose word equals <c>CoNLLDocumentReaderAndWriter.Boundary</c>
        /// or the literal <c>-DOCSTART-</c>, and also at the end of each list produced by the reader's
        /// iterator. Marker tokens themselves are never added to a sentence. Every kept token is
        /// annotated in place with its 1-based position inside the sentence and with its word copied
        /// into the value, text and original-text annotations.
        /// </remarks>
        /// <param name="reader">Source of the column-formatted text.</param>
        /// <param name="categoriesAllowed">Not read by this method; kept for signature compatibility.</param>
        /// <param name="setClassForTheseLabels">
        /// Optional map from answer label to annotation class. When a token's label is a key of this
        /// map, the label is additionally stored on the token under the mapped class. May be null.
        /// </param>
        /// <param name="setGoldClass">When true, the token's answer label is also stored as its gold answer.</param>
        /// <param name="sentIDprefix">Prefix of the generated sentence ids.</param>
        /// <returns>
        /// A dictionary keyed by <c>sentIDprefix + "-" + n</c>, where <c>n</c> is a running sentence
        /// counter starting at 0 and shared across all lists returned by the iterator.
        /// </returns>
        public static IDictionary <string, DataInstance> ParseColumnFile(BufferedReader reader, ICollection <string> categoriesAllowed, IDictionary <string, Type> setClassForTheseLabels, bool setGoldClass, string sentIDprefix)
        {
            CoNLLDocumentReaderAndWriter conllreader = new CoNLLDocumentReaderAndWriter();
            Properties         props = new Properties();
            SeqClassifierFlags flags = new SeqClassifierFlags(props);

            flags.entitySubclassification       = "noprefix";
            flags.retainEntitySubclassification = false;
            conllreader.Init(flags);
            IEnumerator <IList <CoreLabel> > dociter = conllreader.GetIterator(reader);

            // Running sentence counter; incremented before use, so the first id ends in "-0".
            int num = -1;
            IDictionary <string, DataInstance> sents = new Dictionary <string, DataInstance>();

            while (dociter.MoveNext())
            {
                IList <CoreLabel> doc      = dociter.Current;
                IList <CoreLabel> sentcore = new List <CoreLabel>();
                int tokenindex             = 0;
                foreach (CoreLabel l in doc)
                {
                    string word = l.Word();
                    if (word.Equals(CoNLLDocumentReaderAndWriter.Boundary) || word.Equals("-DOCSTART-"))
                    {
                        // Sentence/document marker: flush what has been collected so far (if anything)
                        // and start a fresh sentence. The marker token itself is dropped.
                        if (sentcore.Count > 0)
                        {
                            num++;
                            string docid = sentIDprefix + "-" + num.ToString();
                            sents[docid] = DataInstance.GetNewSurfaceInstance(sentcore);
                            sentcore     = new List <CoreLabel>();
                            tokenindex   = 0;
                        }
                        continue;
                    }

                    // Token positions are 1-based within the current sentence.
                    tokenindex++;
                    l.Set(typeof(CoreAnnotations.IndexAnnotation), tokenindex);
                    l.Set(typeof(CoreAnnotations.ValueAnnotation), word);
                    string label = l.Get(typeof(CoreAnnotations.AnswerAnnotation));
                    System.Diagnostics.Debug.Assert(label != null, "label cannot be null");
                    l.Set(typeof(CoreAnnotations.TextAnnotation), word);
                    l.Set(typeof(CoreAnnotations.OriginalTextAnnotation), word);
                    if (setGoldClass)
                    {
                        l.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
                    }

                    // Single keyed lookup instead of Contains + indexer. The null guard matters in
                    // release builds, where the Debug.Assert above is compiled out and a null key
                    // would otherwise make the dictionary lookup throw.
                    Type annotationClass;
                    if (setClassForTheseLabels != null && label != null && setClassForTheseLabels.TryGetValue(label, out annotationClass))
                    {
                        l.Set(annotationClass, label);
                    }
                    sentcore.Add(l);
                }

                // Flush the trailing sentence when the list did not end with a marker token.
                if (sentcore.Count > 0)
                {
                    num++;
                    string docid = sentIDprefix + "-" + num.ToString();
                    sents[docid] = DataInstance.GetNewSurfaceInstance(sentcore);
                }
            }
            return(sents);
        }