c++ source code of full

jumanpp-master
- src
  - jumandic
    - main
      - jumanpp.cc
      - bootstrap.cc
      - path_diff.cc
      - jumanpp_train.h
      - jumanpp_train.cc
      - jumanpp.h
    - proto
      - juman.proto
      - jumanpp.proto
    - CMakeLists.txt
    - tests
      - bug_28_lattice.cc
      - bug_950111-003_test.cc
      - partial_data_train.cc
      - unk_node_match_test.cc
    - shared
      - jumanpp_args.cc
      - mdic_format.cc
      - jumanpp_pb_format.h
      - jumanpp_pb_format.cc
      - jumandic_env.h
      - juman_pb_format.cc
      - training_test.cc
      - jumandic_test_env.h
      - lattice_format.h
      - juman_format.h
      - mini_dic_test.cc
      - mdic_format.h
      - morph_format.cc
      - jumandic_spec.cc
      - jumandic_env.cc
      - mdic_format_test.cc
      - jumandic_ids.h
      - juman_pb_format.h
      - jumanpp_args.h
      - lattice_format.cc
      - jumandic_spec.h
      - jumandic_spec_test.cc
      - subset_format.cc
      - jumandic_debug_dump_test.cc
      - jumandic_id_resolver.h
      - subset_format.h
      - jumandic_codegen_test.cc
      - juman_format.cc
      - jumandic_id_resolver.cc
      - jumandic_spec_lexdata.cc
      - morph_format.h
      - jumandic_ids.cc
  - util
    - inlined_vector_test.cc
    - format.cc
    - serialization_test.cc
    - characters_test.cpp
    - status.cpp
    - string_piece.cc
    - array_slice_internal.h
    - parse_utils.cc
    - quantized_weights.h
    - debug_output.h
    - fast_hash_test.cc
    - types.hpp
    - assert.h
    - characters.h
    - csv_reader.cc
    - codegen.cc
    - cfg.h
    - serialization_flatmap.h
    - common.hpp
    - mmap_test.cc
    - printer_test.cc
    - bounded_queue.h
    - array_slice_test.cc
    - coded_io.h
    - coded_io_test.cc
    - lazy.h
    - serialization.h
    - flatrep.h
    - hashing_test.cc
    - coded_io.cc
    - lru_cache.h
    - printer.h
    - status.hpp
    - mmap_impl_unix.h
    - fast_hash.h
    - array_slice_util.h
    - string_piece_test.cc
    - mmap_impl_win32.h
    - lazy_test.cc
    - format.h
    - seahash.h
    - murmur_hash.h
    - array_slice.h
    - memory.hpp
    - status_test.cpp
    - parse_utils_test.cc
    - string_piece.h
    - logging.cpp
    - printer.cc
    - codegen.h
    - parse_utils.h
    - sliceable_array.h
    - memory.cpp
    - CMakeLists.txt
    - assert.cc
    - hashing.h
    - mmap.h
    - stl_util.h
    - characters.cc
    - csv_reader_test.cc
    - stl_util_test.cc
    - memory_test.cpp
    - win32_utils.h
    - inlined_vector.h
    - char_buffer.h
    - logging.hpp
    - mmap.cc
    - fast_printer.h
    - seahash_test.cc
    - array_slice_util_test.cc
    - win32_utils.cc
    - csv_reader.h
    - flatmap.h
    - flatset.h
  - CMakeLists.txt
  - rnn
    - mikolov_rnn.cc
    - rnn_arg_parse.h
    - mikolov_rnn_impl.h
    - mikolov_rnn.h
    - CMakeLists.txt
    - legacy
      - rnnlmlib.h
      - rnnlmlib_static.cpp
      - rnnlmlib_static.h
    - mikolov_rnn_test.cc
    - simple_rnn_impl.h
  - core
    - impl
      - feature_impl_compute_test.cc
      - model_io_test.cc
      - feature_impl_pattern.h
      - feature_computer.cc
      - feature_impl_compute.cc
      - graphviz_format.cc
      - feature_impl_combine.cc
      - int_seq_util.h
      - model_format.h
      - feature_debug.cc
      - feature_impl_prim_test.cc
      - kvlist_test.cc
      - feature_impl_ngram_partial.cc
      - graphviz_format.h
      - feature_impl_prim.cc
      - feature_test.cc
      - feature_impl_pattern.cc
      - feature_impl_compute.h
      - feature_computer.h
      - model_io.cc
      - segmented_format.cc
      - feature_impl_combine_test.cc
      - feature_impl_ngram_partial.h
      - global_beam_position_fmt.cc
      - feature_impl_types.h
      - model_io.h
      - perceptron_io.h
      - CMakeLists.txt
      - aligned_field_test.cc
      - global_beam_position_fmt.h
      - model_format_ser.h
      - feature_types.h
      - feature_impl_prim.h
      - feature_impl_ngram_partial_kernels.h
      - feature_impl_ngram_partial_test.cc
      - feature_impl_combine.h
      - graphviz_format_test.cc
      - feature_debug.h
      - segmented_format.h
    - features_api.h
    - devtools
      - CMakeLists.txt
      - fasthash_quality.cc
      - strings_alignment_hinter.cc
    - proto
      - lattice_dump_output.cc
      - lattice_dump_output.h
      - CMakeLists.txt
      - lattice_dump.proto
    - training
      - partial_trainer.h
      - trainer.h
      - gold_example_test.cc
      - loss.h
      - training_test_common.h
      - full_example.cc
      - trainer_base.cc
      - gold_example.h
      - partial_example_train_test.cc
      - loss.cc
      - training_types.h
      - scw.h
      - training_env.cc
      - training_env.h
      - trainer_test.cc
      - scw.cc
      - partial_trainer.cc
      - trainer_base.h
      - gold_example.cc
      - CMakeLists.txt
      - trainer.cc
      - full_example.h
      - training_executor.h
      - gold_example2_test.cc
      - training_executor.cc
    - core_config.h.in
    - tool
      - train_cmd.cc
      - codegen_cmd.cc
      - codegen_cmd.h
      - index_cmd.h
      - train_cmd.h
      - CMakeLists.txt
      - index_cmd.cc
      - jumanpp_tool.cc
    - features_api.cc
    - test
      - test_analyzer_env.h
    - api
      - CMakeLists.txt
    - core_types.h
    - env.cc
    - benchmarks
      - codegen_bench_01.cc
      - perceptron_bench_2.cc
      - fasthash_bench.cc
      - perceptron_bench.cc
      - CMakeLists.txt
      - feature_hash_kernel_bench.cc
      - other
        - spec_01.h
    - spec
      - spec_compiler.cc
      - spec_types.h
      - spec_dsl.cc
      - spec_serialization.cc
      - spec_parser_impl_test.cc
      - spec_serialization.h
      - spec_hashing.h
      - spec_parser_impl.cc
      - spec_dsl.h
      - spec_parser.h
      - spec_parser_impl.h
      - CMakeLists.txt
      - spec_compiler.h
      - spec_types.cc
      - spec_grammar_test.cc
      - spec_dsl_feature_test.cc
      - spec_parser.cc
      - spec_dsl_test.cc
      - spec_hashing.cc
      - spec_ser.h
      - spec_grammar.h
    - env.h
    - core.h
    - analysis
      - numeric_creator.h
      - extra_nodes.cc
      - lattice_builder.h
      - score_processor.cc
      - lattice_types.cc
      - score_plugin.h
      - extra_nodes.h
      - lattice_config.h
      - unk_maker_types.h
      - lattice_types_test.cc
      - rnn_serialization.h
      - output.h
      - unk_nodes_creator_test.cc
      - rnn_scorer.cc
      - lattice_types.h
      - onomatopoeia_creator.h
      - analyzer_impl_test.cc
      - analyzer_impl.h
      - dictionary_node_creator_test.cc
      - rnn_id_resolver_test.cc
      - lattice_builder_test.cc
      - lattice_compactor_test.cc
      - charlattice_test.cc
      - unk_nodes_creator.cc
      - normalized_node_creator_test.cc
      - dictionary_node_creator.h
      - numeric_creator.cc
      - lattice_builder.cc
      - innode_features.h
      - perceptron.h
      - charlattice.h
      - dic_reader.cc
      - ngram_computations.h
      - rnn_id_resolver.h
      - analysis_result.h
      - normalized_node_creator.h
      - analyzer_impl.cc
      - dic_reader.h
      - analyzer.cc
      - analysis_input.h
      - perceptron_test.cc
      - ngram_computations.cc
      - unk_nodes_creator.h
      - score_api.h
      - charlattice.cc
      - onomatopoeia_creator_test.cc
      - rnn_scorer.h
      - normalized_node_creator.cc
      - score_processor_test.cc
      - output.cc
      - dictionary_node_creator.cc
      - CMakeLists.txt
      - score_processor.h
      - innode_features.cc
      - perceptron.cc
      - analysis_input.cc
      - analysis_result.cc
      - numeric_creator_test.cc
      - onomatopoeia_creator.cc
      - rnn_scorer_test.cc
      - unk_nodes.cc
      - rnn_scorer_gbeam.cc
      - rnn_scorer_gbeam.h
      - lattice_config.cc
      - unk_nodes.h
      - rnn_id_resolver.cc
      - analyzer.h
    - CMakeLists.txt
    - codegen
      - partial_ngram_feature_codegen.h
      - feature_codegen.cc
      - ngram_feature_codegen.cc
      - codegen_main.cc.in
      - pattern_feature_codegen.h
      - feature_codegen_test.cc
      - pattern_feature_codegen.cc
      - cg_2_spec.h
      - CMakeLists.txt
      - partial_ngram_feature_codegen.cc
      - ngram_feature_codegen.h
      - pattern_codegen_test.cc
      - feature_codegen.h
    - core_version.h.in
    - core.cc
    - dic
      - darts_trie_test.cc
      - progress.h
      - field_reader.h
      - dic_feature_impl.cc
      - dic_build_detail.cc
      - dic_builder.h
      - entry_builder.h
      - dictionary_test.cc
      - field_import.h
      - darts_trie.cc
      - dic_deduplication_test.cc
      - dic_build_detail.h
      - darts.h
      - field_import.cc
      - dic_feature_impl.h
      - dic_entries.h
      - dictionary.h
      - CMakeLists.txt
      - darts_trie.h
      - dic_builder.cc
      - field_reader_test.cc
      - field_import_test.cc
      - dictionary.cc
      - entry_builder.cc
    - input
      - partial_example.cc
      - training_io.cc
      - partial_example_io.cc
      - stream_reader.h
      - training_io.h
      - partial_example_io_test.cc
      - pex_stream_reader.h
      - CMakeLists.txt
      - pex_stream_reader.cc
      - stream_reader.cc
      - partial_example.h
      - partial_example_io.h
  - testing
    - test_analyzer.h
    - read_file_test.cc
    - test_config.h.in
    - catch_main.cc
    - CMakeLists.txt
    - standalone_test.h
- cmake
  - JumanppStaticFeatures.cmake
  - GitUpdate.cmake
  - GitVersion.cmake
  - CPackJpp.cmake
  - ProtobufCompile.cmake
- version.cmake
- script
  - lattice2juman.rb
  - fmrpwdiff.sh
  - fullmrp2surf.rb
  - jumanpp-filter
  - mecab2juman.rb
  - git-clang-format.py
  - jmn2base.rb
  - fullmrp2idmrp.rb
  - eval_v2.py
  - corpus
    - fairy2jpppart.py
  - string2charlit.py
  - jmn2base.py
  - fullmrp2mrp.rb
  - JumanSexp.pm
  - fullmrp2mecab.rb
  - fullmrp2surfp.rb
  - jmn2rep_pos.rb
  - postprocess.py
  - lib
    - Grammar.pm
    - Sexp.pm
  - fullmrp2basep.rb
  - eval.pl
  - jmn2rep.rb
  - fullmrp2idraw.rb
  - fullmrp2raw.rb
  - eval_lattice.py
  - corpus2train.rb
  - __init__.py
  - sexp.py
  - create_grammar_map.pl
  - grammar_id_mapping.py
  - gen_feature_template.py
  - idmap.py
  - fullmrp2base.rb
  - juman2morph.rb
- sample
  - python_knp.py
  - sample.jppv2part
  - python_juman.py
  - atool.jppv2part
- CONTRIBUTORS
- libs
  - benchpress
    - cxxopts.hpp
    - benchpress.hpp
  - backward.hpp
  - pathie-cpp
    - src
      - errors.cpp
      - path.cpp
      - temp.cpp
      - pathie.cpp
      - pathie_ofstream.cpp
      - pathie_ifstream.cpp
      - entry_iterator.cpp
    - pathlist.md
    - CHANGELOG
    - include
      - pathie.hpp
      - errors.hpp
      - path.hpp
      - entry_iterator.hpp
      - pathie_ofstream.hpp
      - pathie_ifstream.hpp
      - temp.hpp
    - Doxyfile
    - examples
      - minils.cpp
      - decompose.cpp
      - findexample.cpp
      - tempdir.cpp
      - tempfile.cpp
    - LICENSE
    - test
      - pathie_streams.cpp
      - testhelpers.hpp
      - path.cpp
      - Rakefile
      - encodings.cpp
    - .dir-locals.el
    - CMakeLists.txt
    - README.md
    - .gitignore
    - INSTALL.md
    - links.txt
  - args.h
  - pegtl
    - file_input.hpp
    - uint32.hpp
    - version.hpp
    - rules.hpp
    - tracking_mode.hpp
    - nothing.hpp
    - ascii.hpp
    - istream_input.hpp
    - utf32.hpp
    - read_input.hpp
    - memory_input.hpp
    - rewind_mode.hpp
    - apply_mode.hpp
    - utf16.hpp
    - uint64.hpp
    - parse.hpp
    - position.hpp
    - analyze.hpp
    - input_error.hpp
    - eol.hpp
    - eol_pair.hpp
    - buffer_input.hpp
    - internal
      - bump_help.hpp
      - skip_control.hpp
      - bof.hpp
      - list_tail_pad.hpp
      - peek_uint8.hpp
      - demangle_sanitise.hpp
      - demangle_nop.hpp
      - not_at.hpp
      - plus.hpp
      - rep_min_max.hpp
      - input_pair.hpp
      - ranges.hpp
      - duseltronik.hpp
      - peek_mask_uint.hpp
      - disable.hpp
      - rules.hpp
      - cr_crlf_eol.hpp
      - pad_opt.hpp
      - file_opener.hpp
      - file_mapper.hpp
      - star_must.hpp
      - bytes.hpp
      - state.hpp
      - list_tail.hpp
      - raise.hpp
      - result_on_found.hpp
      - apply0.hpp
      - trivial.hpp
      - apply_single.hpp
      - endian_gcc.hpp
      - endian_win.hpp
      - if_must_else.hpp
      - one.hpp
      - string.hpp
      - if_must.hpp
      - cstring_reader.hpp
      - minus.hpp
      - require.hpp
      - bump_impl.hpp
      - dusel_mode.hpp
      - sor.hpp
      - demangle_cxxabi.hpp
      - enable.hpp
      - apply0_single.hpp
      - rep.hpp
      - action.hpp
      - has_apply.hpp
      - has_apply0.hpp
      - rep_min.hpp
      - action_input.hpp
      - alpha.hpp
      - any.hpp
      - peek_utf32.hpp
      - eof.hpp
      - lf_eol.hpp
      - eol.hpp
      - must.hpp
      - cr_eol.hpp
      - list_must.hpp
      - cstream_reader.hpp
      - demangle.hpp
      - alnum.hpp
      - if_apply.hpp
      - range.hpp
      - marker.hpp
      - iterator.hpp
      - identifier.hpp
      - at.hpp
      - integer_sequence.hpp
      - if_then_else.hpp
      - peek_utf16.hpp
      - list.hpp
      - lf_crlf_eol.hpp
      - crlf_eol.hpp
      - try_catch_type.hpp
      - star.hpp
      - pegtl_string.hpp
      - file_reader.hpp
      - pad.hpp
      - apply.hpp
      - istream_reader.hpp
      - control.hpp
      - rep_opt.hpp
      - peek_mask_uint8.hpp
      - rule_conjunction.hpp
      - until.hpp
      - discard.hpp
      - peek_utf8.hpp
      - endian.hpp
      - seq.hpp
      - peek_uint.hpp
      - peek_char.hpp
      - istring.hpp
      - bol.hpp
      - opt.hpp
      - read_uint.hpp
      - eolf.hpp
    - uint16.hpp
    - mmap_input.hpp
    - analysis
      - insert_guard.hpp
      - grammar_info.hpp
      - generic.hpp
      - rule_info.hpp
      - counted.hpp
      - rule_type.hpp
      - insert_rules.hpp
      - analyze_cycles.hpp
    - config.hpp
    - cstream_input.hpp
    - parse_error.hpp
    - normal.hpp
    - uint8.hpp
    - string_input.hpp
    - contrib
      - json.hpp
      - changes.hpp
      - uri.hpp
      - raw_string.hpp
      - http.hpp
      - integer.hpp
      - tracer.hpp
      - alphabet.hpp
      - rep_one_min_max.hpp
      - to_string.hpp
      - unescape.hpp
      - parse_tree.hpp
      - icu
        - utf32.hpp
        - internal.hpp
        - utf16.hpp
        - utf8.hpp
      - abnf.hpp
      - counter.hpp
    - argv_input.hpp
    - utf8.hpp
  - CMakeLists.txt
  - README.md
  - .gitignore
- do_format.sh
- LICENSE
- test
  - jumandic
    - bug-28-lattice.mdic
    - bug950111-003.mdic
    - codegen.mdic
    - bug950111-003.in
    - partial_01.data
    - jumanpp_minimal.mdic
    - bug-28-lattice.in
    - train_mini_01.txt
    - unk_ex.data
  - csv
    - small.csv
  - CMakeLists.txt
  - rnn
    - .gitignore
  - label.txt
- travis
  - cmake.conf
- .clang-format
- CMakeLists.txt
- model-template
  - jumandic.conf.in
  - CMakeLists.txt
- .travis.yml
- README.md
- .gitignore
- docs
  - analysis.md
  - README_ja_v1.md
  - dictionary.md
  - building.md
  - output.md
  - spec.md
- make_release.sh

//
// Created by Arseny Tolmachev on 2017/10/10.
//

#include "full_example.h"

namespace jumanpp {
namespace core {
namespace training {

// Reads the next example from the current input into `result`.
//
// Every per-example container of `result` is emptied before parsing, so the
// same object can be passed in repeatedly without surface text, unknown
// strings or comments of a previous example leaking into the next one.
// Parsing itself is delegated to the reader that matches the current mode.
//
// Returns the status of the mode-specific reader, or a "not implemented"
// status when the current mode has no reader.
Status FullExampleReader::readFullExample(FullyAnnotatedExample *result) {
  result->data_.clear();
  result->lengths_.clear();
  // The readers below append to surface_ and strings_ (and to comment_ in the
  // double-CSV mode) without clearing them, and the simple CSV reader leaves
  // comment_ untouched when an example has no comment line. Reset them here
  // together with data_ and lengths_.
  result->surface_.clear();
  result->strings_.clear();
  result->comment_.clear();

  switch (mode_) {
    case DataReaderMode::SimpleCsv:
      return readFullExampleCsv(result);
    case DataReaderMode::DoubleCsv:
      return readFullExampleDblCsv(result);
  }
  return JPPS_NOT_IMPLEMENTED << "example type " << static_cast<int>(mode_)
                              << " is not implemented";
}

// Puts the reader into the simple CSV mode and initializes a fresh CSV
// reader from `data`.
//
// The field descriptions of tio_ must already be present; when they are
// empty an InvalidState status is returned and the reader is not modified.
// Otherwise the result of initializing the CSV reader is returned.
Status FullExampleReader::initCsv(StringPiece data) {
  JPP_DCHECK(tio_ != nullptr);

  const bool noFields = tio_->fields().empty();
  if (noFields) {
    return Status::InvalidState()
           << "field data is not initialized, you must do that first";
  }

  finished_ = false;
  mode_ = DataReaderMode::SimpleCsv;
  csv_ = util::CsvReader();
  return csv_.initFromMemory(data);
}

// Puts the reader into the double CSV mode and initializes the outer CSV
// reader from `data`.
//
// Two readers are set up: csv_ is constructed with `tokenSep` and csv2_ with
// `fieldSep`; `fieldSep` is additionally remembered in doubleFldSep_.
//
// The field descriptions of tio_ must already be present; when they are
// empty an InvalidState status is returned and the reader is not modified.
// Otherwise the result of initializing the outer reader is returned.
Status FullExampleReader::initDoubleCsv(StringPiece data, char tokenSep,
                                        char fieldSep) {
  JPP_DCHECK(tio_ != nullptr);

  const bool noFields = tio_->fields().empty();
  if (noFields) {
    return Status::InvalidState()
           << "field data is not initialized, you must do that first";
  }

  finished_ = false;
  mode_ = DataReaderMode::DoubleCsv;
  doubleFldSep_ = fieldSep;
  csv2_ = util::CsvReader{fieldSep};
  csv_ = util::CsvReader{tokenSep};
  return csv_.initFromMemory(data);
}

// Returns true when the first s2.size() characters of s1 compare equal to s2.
// A prefix longer than s1 never matches.
bool startsWith(StringPiece s1, StringPiece s2) {
  const auto prefixLen = s2.size();
  return prefixLen <= s1.size() && s1.take(prefixLen) == s2;
}

// Reads one example in the simple CSV mode: one word per CSV line.
//
// Lines are consumed until a terminator line is found, which is a line with
// a single field that is either empty or equal to "EOS". A line starting
// with "# " is treated as the example comment and is not parsed as a word.
// When the input runs out before a terminator is seen, finished_ is set.
//
// Returns Ok at the terminator or at the end of input, or the first error
// reported while parsing a word line.
Status FullExampleReader::readFullExampleCsv(FullyAnnotatedExample *result) {
  while (csv_.nextLine()) {
    if (csv_.numFields() == 1 &&
        (csv_.field(0).empty() || csv_.field(0) == "EOS")) {
      return Status::Ok();
    }

    auto line = csv_.line();
    if (startsWith(line, "# ")) {
      // Store only the comment text: the "# " marker is skipped, which
      // matches the double-CSV reader where the "#" token itself is not part
      // of the comment.
      auto first = line.char_begin() + 2;
      auto last = line.char_end();
      // Drop the line terminator only when it is really there, so that a
      // comment on a final line without a newline keeps its last character.
      // NOTE(review): assumes csv_.line() includes the trailing '\n' when the
      // line has one - confirm against CsvReader.
      if (last != first && *(last - 1) == '\n') {
        --last;
      }
      result->comment_.assign(first, last);
      continue;
    }

    JPP_RETURN_IF_ERROR(readSingleExampleFragment(csv_, result));
  }
  finished_ = true;
  return Status::Ok();
}

// Reads one example in the double CSV mode: one example per line.
//
// The outer reader csv_ splits the line into words and the inner reader
// csv2_ splits every word into its fields. A word equal to "#" starts the
// comment: all remaining words of the line are appended to the comment of
// `result`, joined with the separator of the outer reader, and no further
// words are parsed.
//
// finished_ is set to true when there are no more lines and to false
// otherwise. Returns Ok at the end of input or after the line was processed,
// or the first error reported while parsing a word.
Status FullExampleReader::readFullExampleDblCsv(FullyAnnotatedExample *result) {
  if (!csv_.nextLine()) {
    finished_ = true;
    return Status::Ok();
  }
  finished_ = false;

  auto wordCount = csv_.numFields();
  result->data_.reserve(wordCount * tio_->fields().size());
  result->lengths_.reserve(wordCount);

  for (int wordIdx = 0; wordIdx < wordCount; ++wordIdx) {
    auto word = csv_.field(wordIdx);

    if (!(word == "#")) {
      // Regular word: parse its fields with the inner reader.
      JPP_RETURN_IF_ERROR(csv2_.initFromMemory(word));
      if (!csv2_.nextLine()) {
        return JPPS_INVALID_PARAMETER << "failed to read word #" << wordIdx
                                      << " from the line #"
                                      << csv_.lineNumber();
      }
      JPP_RIE_MSG(readSingleExampleFragment(csv2_, result), "data=" << word);
      continue;
    }

    // Comment marker: the rest of the line is the comment text.
    int partIdx = wordIdx + 1;
    while (partIdx < wordCount) {
      auto piece = csv_.field(partIdx);
      result->comment_.append(piece.char_begin(), piece.char_end());
      if (partIdx != wordCount - 1) {
        result->comment_.push_back(csv_.separator());
      }
      ++partIdx;
    }
    break;
  }

  return Status::Ok();
}

// Parses a single word from the current row of `csv` and appends it to
// `result`.
//
// For the word this appends:
//  - the number of code points of its surface to lengths_,
//  - the raw surface to surface_,
//  - one entry per configured field to data_: the mapped integer when the
//    field value is present in the field's str2int map; otherwise the value
//    is stored in strings_ and data_ receives the bitwise complement of its
//    index there.
//
// Returns an InvalidParameter status when the row has fewer fields than are
// configured or when a string can not be imported, and forwards errors of
// the surface preprocessing. The field count is checked before anything is
// read from the row or written to `result`.
Status FullExampleReader::readSingleExampleFragment(
    const util::CsvReader &csv, FullyAnnotatedExample *result) {
  const auto &fields = tio_->fields();

  // Validate the row before touching any of its fields, so that a short row
  // is reported as an error instead of being indexed past its end, and
  // `result` is left unmodified in that case.
  if (csv.numFields() < fields.size()) {
    return JPPS_INVALID_PARAMETER
           << "a word from the line #" << csv_.lineNumber() << " had "
           << csv.numFields() << " fields, expected " << fields.size();
  }

  codepts_.clear();
  auto surfFld = csv.field(tio_->surfaceFieldIdx());

  JPP_RETURN_IF_ERROR(chars::preprocessRawData(surfFld, &codepts_));
  result->lengths_.push_back(codepts_.size());
  result->surface_.append(surfFld.char_begin(), surfFld.char_end());

  for (const auto &fldInfo : fields) {
    auto &map = *fldInfo.str2int;
    auto fld = csv.field(fldInfo.exampleFieldIdx);
    auto it = map.find(fld);
    if (it != map.end()) {
      result->data_.push_back(it->second);
      continue;
    }

    // Value is not known to the field's map: keep the string itself.
    // NOTE(review): this checks csv_ (the outer reader) and not `csv`; in the
    // double-CSV mode `csv` is csv2_. Presumably import() copies the field
    // into storage owned by this reader because quoted fields live in a
    // temporary buffer - confirm whether `csv.rowHadQuoted()` must be
    // checked here as well.
    if (csv_.rowHadQuoted()) {
      if (!charBuffer_.import(&fld)) {
        return JPPS_INVALID_PARAMETER << "failed to import string at file "
                                      << filename_ << ":" << csv_.lineNumber()
                                      << " data=" << fld;
      }
    }
    i32 item = static_cast<i32>(result->strings_.size());
    result->strings_.emplace_back(fld);
    // Complemented indices are negative, which separates them from ids
    // produced by the map lookup above (assuming those are non-negative).
    result->data_.push_back(~item);
  }
  return Status::Ok();
}

}  // namespace training
}  // namespace core
}  // namespace jumanpp