org.grobid.core.exceptions.GrobidException Java Examples

The following examples show how to use org.grobid.core.exceptions.GrobidException. Each example is taken from an open-source project; the source file, project, and license are indicated above each snippet.
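
All of the snippets below follow the same pattern: a lower-level checked exception is caught and rethrown wrapped in GrobidException, which is unchecked, with a message naming the step that failed. A minimal sketch of that pattern (the class name, resource path, and loading logic are hypothetical, not taken from the projects below):

import org.grobid.core.exceptions.GrobidException;

public class LexiconLoader {
    // Illustrative only: wrap any failure from the loading step in a
    // GrobidException carrying a message and the original cause.
    public void load(String path) {
        try {
            // ... open and parse the resource at 'path' ...
        } catch (Exception e) {
            throw new GrobidException("Cannot load lexicon resource: " + path, e);
        }
    }
}
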
Example #1
Source File: NLPLeaderboardFigParser.java    From science-result-extractor with Apache License 2.0
/**
 * The processing here is called from the full-text parser in cascade.
 * Start and end positions in the higher-level tokenization are indicated
 * in the resulting Figure object.
 */
public Figure processing(List<LayoutToken> tokenizationFigure, String featureVector) {
    String res;
    try {
        res = label(featureVector);
    } catch (Exception e) {
        throw new GrobidException("CRF labeling with the figure model failed.", e);
    }
    if (res == null) {
        return null;
    }
    return getExtractionResult(tokenizationFigure, res);
}
 
Example #2
Source File: SenseTaggerTest.java    From grobid-ner with Apache License 2.0
public void testSenseTagger() throws Exception {
    File textFile =
            new File(this.getResourceDir("./src/test/resources/").getAbsoluteFile() + "/test.en.txt");
    if (!textFile.exists()) {
        throw new GrobidException("Cannot start test, because test resource folder is not correctly set.");
    }
    String text = FileUtils.readFileToString(textFile);

    SenseTagger tagger = new SenseTagger();

    List<Sense> senses = tagger.extractSenses(text);
    if (senses != null) {
        for (Sense sense : senses) {
            System.out.print(text.substring(sense.getOffsetStart(), sense.getOffsetEnd()) + "\t");
            System.out.println(sense.toString());
        }
    } else {
        System.out.println("No sense found.");
    }
}
 
Example #3
Source File: NERParserCommon.java    From grobid-ner with Apache License 2.0
/**
 * Process the content of the specified input file and format the result as training data.
 * <p>
 * Input file should be a text file. Each file is a paragraph entry that is normally processed by NERD.
 */
static public StringBuilder createTraining(String inputFile,
                                           String outputPath,
                                           String fileName,
                                           NERParser parser,
                                           String lang,
                                           AbstractTokenizer tokenizer) throws Exception {
    File file = new File(inputFile);
    if (!file.exists()) {
        throw new GrobidException("Cannot create training data because input file can not be accessed: " + inputFile);
    }

    StringBuilder sb = new StringBuilder();
    if (inputFile.endsWith(".txt") || inputFile.endsWith(".TXT")) {
        sb.append(xmlHeader);

        // we use the file name as the document ID, removing spaces;
        // note that this could lead to non-well-formed XML for unusual file names
        sb.append("\t\t<document name=\"" + fileName.replace(" ", "_") + "\">\n");
        createTrainingText(file, parser, lang, tokenizer, sb);
        sb.append("\t\t</document>\n");

        sb.append(xmlEnd);
    }

    if (sb.length() > 0) {
        try {
            FileUtils.writeStringToFile(new File(outputPath), sb.toString());
        } catch (IOException e) {
            throw new GrobidException("Cannot create training data because output file can not be accessed: " + outputPath, e);
        }
    }
    return sb;
}
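
A hypothetical invocation of this helper is sketched below, assuming a configured NERParser and AbstractTokenizer from the surrounding grobid-ner code are available; all paths and file names are illustrative placeholders:

import java.io.File;

public class TrainingDataDemo {
    // Sketch only: the parser and tokenizer are supplied by the caller,
    // and the paths below are placeholders.
    static StringBuilder buildTraining(NERParser parser, AbstractTokenizer tokenizer) throws Exception {
        File input = new File("corpus/article-01.txt");
        return NERParserCommon.createTraining(
                input.getAbsolutePath(),               // inputFile
                "training/article-01.training.xml",    // outputPath
                input.getName(),                       // fileName, used as the document ID
                parser,
                "en",                                  // lang
                tokenizer);
    }
}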
 
Example #4
Source File: NERLexicon.java    From grobid-ner with Apache License 2.0
private NERLexicon() {
    String pathSenseDescriptions = GrobidProperties.getGrobidHomePath() + "/lexicon/senses/descriptions.txt";
    // read the WordNet sense descriptions
    try (BufferedReader bufReader = new BufferedReader(
            new InputStreamReader(new FileInputStream(pathSenseDescriptions)))) {
        String line;
        while ((line = bufReader.readLine()) != null) {
            line = line.trim();
            if (line.length() == 0) {
                continue;
            }
            if (line.startsWith("#")) {
                // this is a comment line
                continue;
            }
            String[] parts = line.split("\t");
            if (parts.length != 2) {
                continue;
            }
            descriptions.put(parts[0], parts[1]);
        }
    } catch (IOException e) {
        throw new GrobidException("Error reading the word sense descriptions file.", e);
    }
}
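
The loop above expects one sense entry per line, with the sense identifier and its description separated by a single tab; blank lines and lines starting with # are skipped. A hypothetical excerpt of such a descriptions.txt (the identifiers and wording are illustrative only, not taken from the actual lexicon):

# lines starting with '#' are comments; fields are tab-separated
bank/N1	sloping land beside a body of water
bank/N2	a financial institution that accepts deposits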
 
Example #5
Source File: SenseTaggerTest.java    From grobid-ner with Apache License 2.0
public File getResourceDir(String resourceDir) {
    File file = new File(resourceDir);
    if (!file.exists()) {
        if (!file.mkdirs()) {
            throw new GrobidException("Cannot start test because the test resource folder is not correctly set.");
        }
    }
    return file;
}
 
 
Example #6
Source File: NerdRestService.java    From entity-fishing with Apache License 2.0
@POST
@Path(DISAMBIGUATE)
@Consumes(MediaType.MULTIPART_FORM_DATA)
@Produces(MediaType.APPLICATION_JSON)
public Response processQueryJson(@FormDataParam(QUERY) String query,
                                 @FormDataParam(FILE) InputStream inputStream) {
    String json = null;
    Response response = null;

    try {
        if (inputStream != null) {
            json = nerdProcessFile.processQueryAndPdfFile(query, inputStream);
        } else {
            json = nerdProcessQuery.processQuery(query);
        }

        if (json == null) {
            response = Response.status(Response.Status.INTERNAL_SERVER_ERROR).build();
        } else {
            response = Response
                    .status(Response.Status.OK)
                    .entity(json)
                    .header(HttpHeaders.CONTENT_TYPE, MediaType.APPLICATION_JSON + "; charset=UTF-8")
                    .header("Access-Control-Allow-Origin", "*")
                    .header("Access-Control-Allow-Methods", "GET, POST, DELETE, PUT")
                    .build();
        }

    } catch (QueryException qe) {
        return handleQueryException(qe, query);
    } catch (NoSuchElementException nseExp) {
        LOGGER.error("Could not get an engine from the pool within configured time. Sending service unavailable.");
        response = Response.status(Response.Status.SERVICE_UNAVAILABLE).build();
    } catch (GrobidException ge) {
        response = Response
                .status(Response.Status.SERVICE_UNAVAILABLE)
                .entity("The PDF cannot be processed by grobid. " + ge.getMessage())
                .build();
    } catch (Exception e) {
        LOGGER.error("An unexpected exception occurs. ", e);
        response = Response.status(Response.Status.INTERNAL_SERVER_ERROR).build();
    }

    return response;
}
 
Example #7
Source File: NERFrenchTrainer.java    From grobid-ner with Apache License 2.0
/**
 * Add the selected features to a NER example set
 *
 * @param corpusDir          a path where corpus files are located
 * @param trainingOutputPath path where to store the temporary training data
 * @param evalOutputPath     path where to store the temporary evaluation data
 * @param splitRatio         ratio to consider for separating training and evaluation data, e.g. 0.8 for 80%
 * @return the total number of used corpus items
 */
@Override
public int createCRFPPData(final File corpusDir,
                           final File trainingOutputPath,
                           final File evalOutputPath,
                           double splitRatio) {
    int totalExamples = 0;
    try {
        System.out.println("sourcePathLabel: " + corpusDir);
        if (trainingOutputPath != null)
            System.out.println("outputPath for training data: " + trainingOutputPath);
        if (evalOutputPath != null)
            System.out.println("outputPath for evaluation data: " + evalOutputPath);

        // the file for writing the training data
        OutputStream os2 = null;
        Writer writer2 = null;
        if (trainingOutputPath != null) {
            os2 = new FileOutputStream(trainingOutputPath);
            writer2 = new OutputStreamWriter(os2, "UTF8");
        }

        // the file for writing the evaluation data
        OutputStream os3 = null;
        Writer writer3 = null;
        if (evalOutputPath != null) {
            os3 = new FileOutputStream(evalOutputPath);
            writer3 = new OutputStreamWriter(os3, "UTF8");
        }

        File corpusLeMondeDir = new File(leMondeCorpusPath);
        if (!corpusLeMondeDir.exists()) {
            LOGGER.warn("Directory does not exist: " + leMondeCorpusPath);
        }
        File[] files = corpusLeMondeDir.listFiles();
        if ((files == null) || (files.length == 0)) {
            LOGGER.warn("No files in directory: " + corpusLeMondeDir);
        } else {
            // process the core training set corresponding to the LeMonde corpus first
            for (int i = 0; i < files.length; i++) {
                System.out.println(files[i].getName());
                if (files[i].getName().indexOf(".xml") != -1)
                    totalExamples += processLeMonde(files[i], writer2, writer3, splitRatio);
            }
        }

        if (writer2 != null) {
            writer2.close();
        }
        if (os2 != null) {
            os2.close();
        }

        if (writer3 != null) {
            writer3.close();
        }
        if (os3 != null) {
            os3.close();
        }
    } catch (Exception e) {
        throw new GrobidException("An exception occured while running Grobid.", e);
    }
    return totalExamples;
}
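
A hypothetical call to this trainer is sketched below; the no-argument constructor and all paths are assumptions for illustration, not taken from the project:

import java.io.File;

public class TrainerDemo {
    public static void main(String[] args) {
        // Sketch only: constructor and paths are illustrative placeholders.
        NERFrenchTrainer trainer = new NERFrenchTrainer();
        int used = trainer.createCRFPPData(
                new File("corpus/fr"),      // corpusDir: labeled corpus files
                new File("tmp/fr.train"),   // trainingOutputPath
                new File("tmp/fr.eval"),    // evalOutputPath
                0.8);                       // splitRatio: 80% training, 20% evaluation
        System.out.println(used + " corpus items used");
    }
}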
 
Example #8
Source File: NEREvaluation.java    From grobid-ner with Apache License 2.0
/**
 * Evaluation based on the CoNLL-2003 shared task NER gold corpus, English set.
 * see http://www.cnts.ua.ac.be/conll2003/ner/.
 */
public String evaluate_reuters() {
    long start = System.currentTimeMillis();
    StringBuilder report = new StringBuilder();
    try {
        GrobidFactory.getInstance();
        NERParsers parsers = new NERParsers();

        File evalDataF = GrobidProperties.getInstance().getEvalCorpusPath(
                new File(new File("resources").getAbsolutePath()), model);

        File tmpEvalPath = getTempEvaluationDataPath();

        report.append("Eval. path: " + tmpEvalPath.getPath() + "\n");

        // There are three sets that we can exploit: testa, testb and the training set.
        // However, the training set should be used to reinforce the learning.
        File evalA = new File(conllPath + "/eng.testa");
        File evalB = new File(conllPath + "/eng.testb");
        File evalTrain = new File(conllPath + "/eng.train");

        if (!evalTrain.exists()) {
            throw new GrobidException(
                    "Cannot start evaluation because the corpus resource path for the CoNLL file " +
                            "is not correctly set: " + evalDataF.getPath() + "/eng.train");
        }
        report.append(evaluate_reutersSet(parsers, evalTrain, tmpEvalPath));

        if (!evalA.exists()) {
            throw new GrobidException(
                    "Cannot start evaluation because the corpus resource path for the CoNLL file " +
                            "is not correctly set: " + evalDataF.getPath() + "/eng.testa");
        }
        report.append(evaluate_reutersSet(parsers, evalA, tmpEvalPath));

        if (!evalB.exists()) {
            throw new GrobidException(
                    "Cannot start evaluation because the corpus resource path for the CoNLL file " +
                            "is not correctly set: " + evalDataF.getPath() + "/eng.testb");
        }
        report.append(evaluate_reutersSet(parsers, evalB, tmpEvalPath));
    } catch (Exception e) {
        throw new GrobidException("An exception occured while running Grobid Reuters evaluation.", e);
    }
    long end = System.currentTimeMillis();
    report.append("processed in " + (end - start) / 1000 + " s.");

    return report.toString();
}
 
Example #9
Source File: NERTrainer.java    From grobid-ner with Apache License 2.0
private int processReutersCorpus(Writer writerTraining, Writer writerEvaluation, double splitRatio) {
    int res = 0;
    try {
        File corpusDir = new File(reutersPath);
        System.out.println("Path to Reuters corpus: " + reutersPath);
        if (!corpusDir.exists()) {
            throw new GrobidException("Cannot start training, because corpus resource folder is not correctly set : "
                    + reutersPath);
        }

        File[] refFiles = corpusDir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return (name.endsWith(".zip"));
            }
        });

        if (refFiles == null) {
            return 0;
        }

        System.out.println(refFiles.length + " Reuters zip files");

        for (File thefile : refFiles) {
            // we ignore the files in the core set
            if (coreSet.contains(thefile.getName().replace(".zip", ""))) {
                continue;
            }
            ZipFile zipFile = new ZipFile(thefile);
            System.out.println(thefile.getPath());
            Enumeration<? extends ZipEntry> entries = zipFile.entries();
            while (entries.hasMoreElements()) {
                ZipEntry entry = entries.nextElement();
                InputStream xmlStream = zipFile.getInputStream(entry);

                res += processReutersCorpus(xmlStream, entry, writerTraining, writerEvaluation, splitRatio);
                xmlStream.close();

                // as the number of files might be too important for development and debugging,
                // we introduce an optional limit
                if ((LIMIT != -1) && (res > LIMIT)) {
                    break;
                }
            }
            zipFile.close();
            if ((GLOBAL_LIMIT != -1) && (res > GLOBAL_LIMIT)) {
                break;
            }
        }
    } catch (IOException ex) {
        throw new GrobidResourceException(
                "An exception occurred when accessing/reading the Reuters corpus zip files.", ex);
    }
    return res;
}
 
Example #10
Source File: NERTrainer.java    From grobid-ner with Apache License 2.0
private int processReutersCorpus(InputStream currentStream,
                                 ZipEntry entry,
                                 Writer writerTraining,
                                 Writer writerEvaluation,
                                 double splitRatio) {
    try {
        // try to open the corresponding semdoc file
        String fileName = entry.getName();
        System.out.println(fileName);
        File semdocFile =
                new File(idiliaPath + "/" + fileName.substring(0, 3) + "/" + fileName.replace(".xml", ".semdoc.xml"));
        if (!semdocFile.exists()) {
            throw new GrobidException("Cannot start training, because corpus resource folder for semdoc file " +
                    " is not correctly set : "
                    + idiliaPath + "/" + fileName.substring(0, 3) + "/" + fileName.replace(".xml", ".semdoc.xml"));
        }

        ReutersSaxHandler reutersSax = new ReutersSaxHandler();

        // get a factory
        SAXParserFactory spf = SAXParserFactory.newInstance();
        spf.setValidating(false);
        spf.setFeature("http://xml.org/sax/features/namespaces", false);
        spf.setFeature("http://xml.org/sax/features/validation", false);
        spf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
        spf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

        //get a new instance of parser
        SAXParser p = spf.newSAXParser();
        p.parse(currentStream, reutersSax);

        SemDocSaxHandler semdocSax = new SemDocSaxHandler(reutersSax.getTextVector());

        p = spf.newSAXParser();
        p.parse(semdocFile, semdocSax);

        // dispatch the document to the training or the evaluation writer
        Writer writer;
        if ((writerTraining == null) && (writerEvaluation != null)) {
            writer = writerEvaluation;
        } else if ((writerTraining != null) && (writerEvaluation == null)) {
            writer = writerTraining;
        } else {
            // both writers available: split randomly according to splitRatio
            writer = (Math.random() <= splitRatio) ? writerTraining : writerEvaluation;
        }

        if (semdocSax.getAnnotatedTextVector() != null) {
            // to store unit term positions
            List<OffsetPosition> locationPositions = null;
            List<OffsetPosition> personTitlePositions = null;
            List<OffsetPosition> organisationPositions = null;
            List<OffsetPosition> orgFormPositions = null;

            List<LayoutToken> tokens = new ArrayList<LayoutToken>();
            List<String> labels = new ArrayList<String>();
            for (String line : semdocSax.getAnnotatedTextVector()) {
                String[] pieces = line.split("\t");
                LayoutToken token = new LayoutToken(pieces[0]);
                String label = null;
                if (pieces.length == 2)
                    label = pieces[1];
                tokens.add(token);
                labels.add(label);

                if (line.trim().equals("@newline")) {

                    locationPositions = lexicon.tokenPositionsLocationNames(tokens);
                    personTitlePositions = lexicon.tokenPositionsPersonTitle(tokens);
                    organisationPositions = lexicon.tokenPositionsOrganisationNames(tokens);
                    orgFormPositions = lexicon.tokenPositionsOrgForm(tokens);

                    addFeatures(tokens, labels, writer,
                            locationPositions, personTitlePositions, organisationPositions, orgFormPositions);
                    writer.write("\n");


                }
            }
        }
    } catch (Exception e) {
        throw new GrobidException("An exception occured while running Grobid.", e);
    }
    return 1;
}