/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.codecs.uniformsplit;

import java.io.IOException;

import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.NAME;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION;
import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.VERSION_CURRENT;

/**
 * A block-based terms index and dictionary that assigns terms to blocks of
 * nearly uniform length. This technique is called Uniform Split.
 * <p>
 * The block construction is driven by two parameters, {@code targetNumBlockLines}
 * and {@code deltaNumLines}.
 * Each block size (number of terms) is {@code targetNumBlockLines} +/- {@code deltaNumLines}.
 * The algorithm computes the minimal distinguishing prefix (MDP) between
 * each term and its previous term (in alphabetical order). Then, in the
 * neighborhood of {@code targetNumBlockLines}, and within {@code deltaNumLines},
 * it selects the term with the shortest MDP. This term becomes the first term
 * of the next block, and its MDP becomes the block key, which is added to the
 * terms dictionary trie.
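 * <p>
 * For example (with purely illustrative terms), if the previous term is
 * {@code client} and the current term is {@code color}, the MDP of
 * {@code color} is {@code co}, since {@code c} alone does not distinguish
 * the two terms. If {@code color} is selected to start the next block,
 * {@code co} becomes that block's key in the dictionary trie.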
 * <p>
 * We call the in-memory trie structure the dictionary, and the file on disk
 * containing the block lines the block file; each line holds one term and its
 * corresponding term state details.
 * <p>
 * When seeking a term, the dictionary seeks the floor leaf of the trie for the
 * searched term and jumps to the corresponding file pointer in the block file.
 * There, the block terms are scanned for the exact searched term.
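 * <p>
 * Continuing the example above, seeking the term {@code colors} would
 * floor-seek the dictionary to the block key {@code co}, jump to that block
 * in the block file, and scan its terms for {@code colors}.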
 * <p>
 * The terms inside a block do not need to share a prefix. Only the block key
 * is used to locate the block in the dictionary trie, and the block key is
 * selected precisely because it is the locally shortest MDP. This makes the
 * dictionary trie very compact.
 * <p>
 * An interesting property of the Uniform Split technique is the nearly linear
 * trade-off between memory usage and lookup performance. Decreasing the
 * target block size makes the block scan faster, but since there are more
 * blocks, the dictionary trie uses more memory. Additionally, small blocks
 * are faster to read from disk. A good sweet spot is a target block size of
 * 32 with a delta of 3 (roughly 10%); these are the default values. They can
 * be tuned in the constructor.
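 * <p>
 * For example, a sketch of a larger block size trading lookup speed for a
 * smaller dictionary (the values below are illustrative, and
 * {@code postingsWriter} and {@code state} are assumed to be available):
 * <pre>{@code
 * // Blocks of 64 terms +/- 6, with no block encoding.
 * FieldsConsumer termsWriter =
 *     new UniformSplitTermsWriter(postingsWriter, state, 64, 6, null);
 * }</pre>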
 * <p>
 * There are additional optimizations:
 * <ul>
 * <li>Each block has a header that allows the lookup to jump directly to
 * the middle term with a fast comparison. This halves the linear scan for a
 * small increase in disk size.</li>
 * <li>Each block term is incrementally encoded according to its previous
 * term. This both reduces the disk size and speeds up the block scan.</li>
 * <li>All term line details (the term states) are written after all terms.
 * This allows a faster term scan without needing to decode the term states.</li>
 * <li>All file pointers are base-encoded: their value is relative to the
 * block base file pointer (not to the previous file pointer), which allows
 * the term state of any term to be read independently.</li>
 * </ul>
 * <p>
 * Blocks can be compressed or encrypted with an optional {@link BlockEncoder}
 * provided in the {@link #UniformSplitTermsWriter(PostingsWriterBase, SegmentWriteState, int, int, BlockEncoder) constructor}.
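 * <p>
 * A minimal {@link BlockEncoder} sketch, assuming a pass-through encoder that
 * copies the bytes unchanged (a real implementation would compress or encrypt
 * them):
 * <pre>{@code
 * BlockEncoder identityEncoder = (blockBytes, length) -> {
 *   // Read the whole block into a buffer, then expose it as WritableBytes.
 *   byte[] bytes = new byte[Math.toIntExact(length)];
 *   blockBytes.readBytes(bytes, 0, bytes.length);
 *   return new BlockEncoder.WritableBytes() {
 *     public long size() {
 *       return bytes.length;
 *     }
 *     public void writeTo(DataOutput dataOutput) throws IOException {
 *       dataOutput.writeBytes(bytes, 0, bytes.length);
 *     }
 *   };
 * };
 * }</pre>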
 * <p>
 * The {@link UniformSplitPostingsFormat#TERMS_BLOCKS_EXTENSION block file}
 * contains all the term blocks for each field sequentially. It also contains
 * the fields metadata at the end of the file.
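 * <p>
 * The resulting block file layout (a sketch derived from
 * {@link #writeFieldsMetadata}):
 * <pre>
 * Header | field 1 blocks | field 2 blocks | ...
 *        | fields metadata | metadata start FP (fixed-length long) | Footer
 * </pre>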
 * <p>
 * The {@link UniformSplitPostingsFormat#TERMS_DICTIONARY_EXTENSION dictionary file}
 * contains the trie ({@link org.apache.lucene.util.fst.FST} bytes) for each
 * field sequentially.
 *
 * @lucene.experimental
 */
public class UniformSplitTermsWriter extends FieldsConsumer {

  /**
   * Default value for the target block size (number of terms per block).
   */
  public static final int DEFAULT_TARGET_NUM_BLOCK_LINES = 32;
  /**
   * Default value for the maximum allowed delta variation of the block size (delta of the number of terms per block).
   * The block size will be [target block size] +/- [allowed delta].
   */
  public static final int DEFAULT_DELTA_NUM_LINES = (int) (DEFAULT_TARGET_NUM_BLOCK_LINES * 0.1);
  /**
   * Upper limit of the block size (maximum number of terms per block).
   */
  protected static final int MAX_NUM_BLOCK_LINES = 1_000;

  protected final FieldInfos fieldInfos;
  protected final PostingsWriterBase postingsWriter;
  protected final int maxDoc;

  protected final int targetNumBlockLines;
  protected final int deltaNumLines;

  protected final BlockEncoder blockEncoder;
  protected final FieldMetadata.Serializer fieldMetadataWriter;
  protected final IndexOutput blockOutput;
  protected final IndexOutput dictionaryOutput;

  /**
   * @param blockEncoder Optional block encoder, may be null if none.
   *                     It can be used for compression or encryption.
   */
  public UniformSplitTermsWriter(PostingsWriterBase postingsWriter, SegmentWriteState state,
                          BlockEncoder blockEncoder) throws IOException {
    this(postingsWriter, state, DEFAULT_TARGET_NUM_BLOCK_LINES, DEFAULT_DELTA_NUM_LINES, blockEncoder);
  }

  /**
   * @param blockEncoder Optional block encoder, may be null if none.
   *                     It can be used for compression or encryption.
   */
  public UniformSplitTermsWriter(PostingsWriterBase postingsWriter, SegmentWriteState state,
                          int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder) throws IOException {
    this(postingsWriter, state, targetNumBlockLines, deltaNumLines, blockEncoder, FieldMetadata.Serializer.INSTANCE,
        NAME, VERSION_CURRENT, TERMS_BLOCKS_EXTENSION, TERMS_DICTIONARY_EXTENSION);
  }
  /**
   * @param targetNumBlockLines Target number of lines per block.
   *                            Must be strictly greater than 0.
   *                            The parameters can be pre-validated with {@link #validateSettings(int, int)}.
   *                            There is one term per block line, with its corresponding details ({@link org.apache.lucene.index.TermState}).
   * @param deltaNumLines       Maximum allowed delta variation of the number of lines per block.
   *                            Must be greater than or equal to 0 and strictly less than {@code targetNumBlockLines}.
   *                            The block size will be {@code targetNumBlockLines} +/- {@code deltaNumLines}.
   *                            Additionally, {@code targetNumBlockLines} + {@code deltaNumLines} must be less than
   *                            or equal to {@link #MAX_NUM_BLOCK_LINES}.
   * @param blockEncoder        Optional block encoder, may be null if none.
   *                            It can be used for compression or encryption.
   */
  protected UniformSplitTermsWriter(PostingsWriterBase postingsWriter, SegmentWriteState state,
                          int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder, FieldMetadata.Serializer fieldMetadataWriter,
                          String codecName, int versionCurrent, String termsBlocksExtension, String dictionaryExtension) throws IOException {
    validateSettings(targetNumBlockLines, deltaNumLines);
    IndexOutput blockOutput = null;
    IndexOutput dictionaryOutput = null;
    boolean success = false;
    try {
      this.fieldInfos = state.fieldInfos;
      this.postingsWriter = postingsWriter;
      this.maxDoc = state.segmentInfo.maxDoc();
      this.targetNumBlockLines = targetNumBlockLines;
      this.deltaNumLines = deltaNumLines;
      this.blockEncoder = blockEncoder;
      this.fieldMetadataWriter = fieldMetadataWriter;

      String termsName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, termsBlocksExtension);
      blockOutput = state.directory.createOutput(termsName, state.context);
      CodecUtil.writeIndexHeader(blockOutput, codecName, versionCurrent, state.segmentInfo.getId(), state.segmentSuffix);

      String indexName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dictionaryExtension);
      dictionaryOutput = state.directory.createOutput(indexName, state.context);
      CodecUtil.writeIndexHeader(dictionaryOutput, codecName, versionCurrent, state.segmentInfo.getId(), state.segmentSuffix);

      postingsWriter.init(blockOutput, state);

      this.blockOutput = blockOutput;
      this.dictionaryOutput = dictionaryOutput;
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(blockOutput, dictionaryOutput);
      }
    }
  }

  /**
   * Validates the {@link #UniformSplitTermsWriter(PostingsWriterBase, SegmentWriteState, int, int, BlockEncoder) constructor}
   * settings.
   *
   * @param targetNumBlockLines Target number of lines per block.
   *                            Must be strictly greater than 0.
   * @param deltaNumLines       Maximum allowed delta variation of the number of lines per block.
   *                            Must be greater than or equal to 0 and strictly less than {@code targetNumBlockLines}.
   *                            Additionally, {@code targetNumBlockLines} + {@code deltaNumLines} must be less than
   *                            or equal to {@link #MAX_NUM_BLOCK_LINES}.
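   * <p>
   * For example (illustrative values):
   * <pre>{@code
   * validateSettings(32, 3);  // valid: the default values
   * validateSettings(32, 32); // throws IllegalArgumentException: delta must be < target
   * }</pre>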
   */
  protected static void validateSettings(int targetNumBlockLines, int deltaNumLines) {
    if (targetNumBlockLines <= 0) {
      throw new IllegalArgumentException("Invalid negative or nul targetNumBlockLines=" + targetNumBlockLines);
    }
    if (deltaNumLines < 0) {
      throw new IllegalArgumentException("Invalid negative deltaNumLines=" + deltaNumLines);
    }
    if (deltaNumLines >= targetNumBlockLines) {
      throw new IllegalArgumentException("Invalid too large deltaNumLines=" + deltaNumLines
          + ", it must be < targetNumBlockLines=" + targetNumBlockLines);
    }
    if (targetNumBlockLines + deltaNumLines > UniformSplitTermsWriter.MAX_NUM_BLOCK_LINES) {
      throw new IllegalArgumentException("Invalid (targetNumBlockLines + deltaNumLines)="
          + (targetNumBlockLines + deltaNumLines) + ", it must be <= MAX_NUM_BLOCK_LINES="
          + UniformSplitTermsWriter.MAX_NUM_BLOCK_LINES);
    }
  }

  @Override
  public void write(Fields fields, NormsProducer normsProducer) throws IOException {
    BlockWriter blockWriter = new BlockWriter(blockOutput, targetNumBlockLines, deltaNumLines, blockEncoder);
    ByteBuffersDataOutput fieldsOutput = new ByteBuffersDataOutput();
    int fieldsNumber = 0;
    for (String field : fields) {
      Terms terms = fields.terms(field);
      if (terms != null) {
        TermsEnum termsEnum = terms.iterator();
        FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
        fieldsNumber += writeFieldTerms(blockWriter, fieldsOutput, termsEnum, fieldInfo, normsProducer);
      }
    }
    writeFieldsMetadata(fieldsNumber, fieldsOutput);
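    // The dictionary file is complete once all fields are written; the block
    // file footer is written by writeFieldsMetadata above.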
    CodecUtil.writeFooter(dictionaryOutput);
  }

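  /**
   * Writes the fields metadata at the end of the block file, followed by its
   * start file pointer (fixed length, so the reader can seek back to it) and
   * the codec footer.
   */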
  protected void writeFieldsMetadata(int fieldsNumber, ByteBuffersDataOutput fieldsOutput) throws IOException {
    long fieldsStartPosition = blockOutput.getFilePointer();
    blockOutput.writeVInt(fieldsNumber);
    if (blockEncoder == null) {
      writeUnencodedFieldsMetadata(fieldsOutput);
    } else {
      writeEncodedFieldsMetadata(fieldsOutput);
    }
    // Must be a fixed length. Read by UniformSplitTermsReader when seeking fields metadata.
    blockOutput.writeLong(fieldsStartPosition);
    CodecUtil.writeFooter(blockOutput);
  }

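  /**
   * Writes the fields metadata verbatim, with no {@link BlockEncoder}.
   */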
  protected void writeUnencodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
    fieldsOutput.copyTo(blockOutput);
  }

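  /**
   * Writes the fields metadata encoded with the {@link BlockEncoder}, prefixed
   * by the encoded length.
   */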
  protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
    BlockEncoder.WritableBytes encodedBytes = blockEncoder.encode(fieldsOutput.toDataInput(), fieldsOutput.size());
    blockOutput.writeVLong(encodedBytes.size());
    encodedBytes.writeTo(blockOutput);
  }

  /**
   * @return 1 if the field was written; 0 otherwise.
   */
  protected int writeFieldTerms(BlockWriter blockWriter, DataOutput fieldsOutput, TermsEnum termsEnum,
                              FieldInfo fieldInfo, NormsProducer normsProducer) throws IOException {

    FieldMetadata fieldMetadata = new FieldMetadata(fieldInfo, maxDoc);
    fieldMetadata.setDictionaryStartFP(dictionaryOutput.getFilePointer());

    postingsWriter.setField(fieldInfo);
    blockWriter.setField(fieldMetadata);
    IndexDictionary.Builder dictionaryBuilder = new FSTDictionary.Builder();
    BytesRef lastTerm = null;
    while (termsEnum.next() != null) {
      BlockTermState blockTermState = writePostingLine(termsEnum, fieldMetadata, normsProducer);
      if (blockTermState != null) {
        lastTerm = BytesRef.deepCopyOf(termsEnum.term());
        blockWriter.addLine(lastTerm, blockTermState, dictionaryBuilder);
      }
    }

    // Flush remaining terms.
    blockWriter.finishLastBlock(dictionaryBuilder);

    if (fieldMetadata.getNumTerms() > 0) {
      fieldMetadata.setLastTerm(lastTerm);
      fieldMetadataWriter.write(fieldsOutput, fieldMetadata);
      writeDictionary(dictionaryBuilder);
      return 1;
    }
    return 0;
  }

  /**
   * Writes the posting values for the current term in the given {@link TermsEnum}
   * and updates the {@link FieldMetadata} stats.
   *
   * @return the written {@link BlockTermState}; or null if none.
   */
  protected BlockTermState writePostingLine(TermsEnum termsEnum, FieldMetadata fieldMetadata, NormsProducer normsProducer) throws IOException {
    BlockTermState state = postingsWriter.writeTerm(termsEnum.term(), termsEnum, fieldMetadata.getDocsSeen(), normsProducer);
    if (state == null) {
      // No doc for this term.
      return null;
    }
    fieldMetadata.updateStats(state);
    return state;
  }

  /**
   * Writes the dictionary index (FST) to disk.
   */
  protected void writeDictionary(IndexDictionary.Builder dictionaryBuilder) throws IOException {
    dictionaryBuilder.build().write(dictionaryOutput, blockEncoder);
  }

  @Override
  public void close() throws IOException {
    IOUtils.close(blockOutput, dictionaryOutput, postingsWriter);
  }
}