Example #1
0
        /// <summary>
        /// Generates the entities XML from NER output and writes it out as the command result.
        /// </summary>
        /// <remarks>
        /// Takes no arguments; the input file, language and verbosity are read from <c>options</c>.
        /// </remarks>
        public override void Run()
        {
            // Diagnostic banner goes to stderr, separate from the result written below.
            if (options.Verbose)
            {
                Console.Error.WriteLine("Option 1.");
            }

            var entitiesXml = NER.GenerateEntitiesToString(options.InputFile, options.Language);
            WriteResult(entitiesXml);
        }
        /// <summary>
        /// Generates the dictionary of entities found: runs NER on the input file, converts the
        /// resulting XML to CSV, removes duplicate entries and writes the CSV as the command result.
        /// </summary>
        /// <remarks>
        /// Takes no arguments; input file, language, separator and verbosity are read from <c>options</c>.
        /// </remarks>
        public override void Run()
        {
            // Diagnostic banner goes to stderr, separate from the result written below.
            if (options.Verbose)
            {
                Console.Error.WriteLine("Dictionary generation command");
            }

            // Step 1: NER output as XML.
            string entitiesXml = NER.GenerateEntitiesToString(options.InputFile, options.Language);

            // Step 2: XML -> CSV using the configured separator, then de-duplicate.
            var rawCsv = CSVUtils.EntitiesToCsv(entitiesXml, options.Separator);
            string csv = CSVUtils.RemoveDuplicates(rawCsv);

            WriteResult(csv);
        }
Example #3
0
 /// <summary>
 /// Releases resources by calling <c>Dispose</c> on <c>WIN</c>, <c>NER</c> and
 /// <c>backGround</c>, in that order.
 /// </summary>
 /// <remarks>
 /// NOTE(review): nothing here guards against a second call or against any of the
 /// three being null; confirm callers invoke this exactly once after initialization.
 /// </remarks>
 public void unload()
 {
     WIN.Dispose();
     NER.Dispose();
     backGround.Dispose();
 }
        /// <summary>
        /// Scans one file for identifiers: parses its text, runs NER on it and, when identifiers
        /// are found, collects the file metadata and stores a <see cref="ScanResult"/> for matching.
        /// </summary>
        /// <param name="repo">Repository whose <c>id</c> is recorded on the result.</param>
        /// <param name="item">Item whose <c>DataObjectIdentifier</c> is used as the path of the file to scan.</param>
        /// <returns>
        /// The stored scan result when identifiers were found; <c>null</c> when <paramref name="item"/>
        /// is null, the file exceeds the size limit, no text or identifiers were found, or processing threw.
        /// </returns>
        public async Task <ScanResult> ScanNext(Repository repo, IteratorItem item)
        {
            // Guard first: the trace line and the catch block below both need the identifier,
            // so a null item must be rejected before anything dereferences it.
            if (item == null)
            {
                return(null);
            }

            // Captured once so logging never depends on item (which is reassigned below) or on
            // retVal (which stays null for most files).
            string identifier = item.DataObjectIdentifier;

            ScanResult retVal    = null;
            string     extention = "";

            Log.Trace("Started processing: " + identifier + ", Thread Id: " + Thread.CurrentThread.ManagedThreadId);

            try
            {
                Dictionary <string, List <string> > identifiers = null;
                FileInfo file = new FileInfo(identifier);
                extention = file.Extension;

                // Don't scan files that are too big. The long literal keeps the multiplication out of
                // 32-bit arithmetic in case _maxFileSize is an int (its declaration is not visible here).
                if (file.Length > _maxFileSize * 1000000L)
                {
                    Log.Warn("File too big:" + file.FullName + ", Size:" + file.Length);
                    Counter.Add("file_too_big", 1);
                }
                else
                {
                    // Get text of the file
                    string txt = await FileUtils.Parse(file);

                    if (txt != null && txt.Length > 0)
                    {
                        // Do NER
                        identifiers = await NER.Parse(txt, file.FullName);

                        if (identifiers != null && identifiers.Count > 0)
                        {
                            Counter.Add("found_pi", 1);

                            // Get metadata
                            item = await CollectFileMetadata(item, file);

                            retVal = new ScanResult()
                            {
                                DataObjectIdentifier = item.DataObjectIdentifier,
                                Identifiers          = identifiers,
                                Metadata             = item.Metadata,
                                RepositoryId         = repo.id
                            };

                            // Store results
                            await SMBDal.AddDataObjectForMatching(retVal);
                        }
                        else
                        {
                            Counter.Add("didnot_find_pi", 1);
                        }
                    }
                    Log.Info("Processed file:" + file.FullName + ", Identifiers: " + (identifiers == null ? "0" : identifiers.Count.ToString()));
                }
            }
            catch (Exception ex)
            {
                // Best-effort scan: a failure on one file is logged and reported as "no result".
                Log.Error(ex, "Failed to process file: " + identifier);
                return(null);
            }

            // retVal is null whenever nothing was found or the file was skipped, so fall back to the
            // identifier captured at entry instead of dereferencing it unconditionally.
            string finishedId = retVal != null ? retVal.DataObjectIdentifier : identifier;
            Log.Trace("Finished processing: " + finishedId + ", Thread Id: " + Thread.CurrentThread.ManagedThreadId);
            Counter.Add(extention, 1);

            return(retVal);
        }
Example #5
0
        /// <summary>
        /// Returns the keys of the Extractor list that corresponds to the given NER tag
        /// (syntactic sugar over the per-tag lists in the Extractor class).
        /// </summary>
        /// <param name="tag">The NER tag to look up: ORG, LOC, TIME, URL or DATE.</param>
        /// <returns>
        /// A new list holding the keys of the matching Extractor list, or a list containing the single
        /// sentinel value <c>-1</c> when that list has no keys.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="tag"/> is not one of the supported tags.
        /// </exception>
        public static List <int> HasNERTag(NER tag)
        {
            // Pick the key source for the tag; every tag then shares the same
            // "copy keys, or return the -1 sentinel" tail below.
            IEnumerable <int> keys;

            switch (tag)
            {
            case NER.ORG:
                keys = IndexEngine.NLP.Extractor.OrgList.Keys;
                break;

            case NER.LOC:
                keys = Extractor.LocList.Keys;
                break;

            case NER.TIME:
                // Previously this branch looped over the keys without adding them and so
                // always returned an empty list; it now behaves like the other tags.
                keys = Extractor.TimeList.Keys;
                break;

            case NER.URL:
                keys = Extractor.URLList.Keys;
                break;

            case NER.DATE:
                keys = Extractor.DateList.Keys;
                break;

            default:
                throw new ArgumentException("Unsupported NER tag: " + tag, "tag");
            }

            List <int> result = keys.ToList();

            // -1 is the "nothing found" sentinel callers receive for an empty list.
            if (result.Count == 0)
            {
                result.Add(-1);
            }

            return(result);
        }