/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.streaming;

import java.io.*;
import java.util.regex.*;

import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;

/** A way to interpret XML fragments as Mapper input records.
 *  Values are XML subtrees delimited by configurable tags.
 *  Keys could be the value of a certain attribute in the XML subtree,
 *  but this is left to the stream processor application.
 *
 *  The name-value properties that StreamXmlRecordReader understands are
 *  (each name is looked up with the CONF_NS prefix):
 *    String begin (chars marking beginning of record)
 *    String end   (chars marking end of record)
 *    int maxrec   (maximum record size; defaults to 50 * 1000)
 *    int lookahead(maximum lookahead to sync CDATA; defaults to 2 * maxrec)
 *    boolean slowmatch (use the regex-based matcher; defaults to false)
 */
public class StreamXmlRecordReader extends StreamBaseRecordReader {

  /**
   * Reads the record-delimiting configuration from the job and positions the
   * reader by calling {@link #init()}.
   *
   * The begin and end marks are fetched through checkJobGet (defined outside
   * this class; presumably it rejects a missing value -- confirm). When
   * slowmatch is enabled, both marks are additionally turned into patterns
   * with makePatternCDataOrMark.
   *
   * @param in       input stream, passed through to the superclass
   * @param split    file split, passed through to the superclass
   * @param reporter progress reporter, passed through to the superclass
   * @param job      job configuration, passed through to the superclass
   * @param fs       file system, passed through to the superclass
   * @throws IOException if the superclass constructor, the configuration
   *                     lookup or init() fails
   */
  public StreamXmlRecordReader(FSDataInputStream in, FileSplit split, Reporter reporter,
                               JobConf job, FileSystem fs) throws IOException {
    super(in, split, reporter, job, fs);

    beginMark_ = checkJobGet(CONF_NS + "begin");
    endMark_ = checkJobGet(CONF_NS + "end");

    maxRecSize_ = job_.getInt(CONF_NS + "maxrec", 50 * 1000);
    // The lookahead default is derived from the (possibly configured) record size.
    lookAhead_ = job_.getInt(CONF_NS + "lookahead", 2 * maxRecSize_);
    synched_ = false;

    slowMatch_ = job_.getBoolean(CONF_NS + "slowmatch", false);
    if (slowMatch_) {
      // Patterns are only needed by slowReadUntilMatch.
      beginPat_ = makePatternCDataOrMark(beginMark_);
      endPat_ = makePatternCDataOrMark(endMark_);
    }
    init();
  }

  /**
   * Positions the underlying stream at the start of this reader's range and
   * advances to the first record boundary.
   *
   * Seeks forward to start_ only when the stream is currently before it, wraps
   * the stream in a BufferedInputStream (needed for the mark/reset calls made
   * while matching) and then calls {@link #seekNextRecordBoundary()}.
   *
   * @throws IOException if querying, seeking or reading the stream fails
   */
  public void init() throws IOException {
    // NOTE(review): the log label names StreamBaseRecordReader although this is
    // StreamXmlRecordReader.init -- confirm whether that is intentional.
    LOG.info("StreamBaseRecordReader.init: " + " start_=" + start_ + " end_=" + end_ + " length_="
             + length_ + " start_ > in_.getPos() =" + (start_ > in_.getPos()) + " " + start_ + " > "
             + in_.getPos());
    if (start_ > in_.getPos()) {
      in_.seek(start_);
    }
    pos_ = start_;
    bin_ = new BufferedInputStream(in_);
    seekNextRecordBoundary();
  }

  /** Number of times {@link #next(Text, Text)} has been called. */
  int numNext = 0;

  /**
   * Reads the next record: scans to the begin mark, then collects bytes up to
   * the end mark.
   *
   * The whole record is stored in the key; the value is set to the empty
   * string.
   *
   * @param key   receives the raw bytes of the record
   * @param value always set to ""
   * @return true if a record was read; false if pos_ has reached end_ or if
   *         either the begin or the end mark could not be matched
   * @throws IOException if reading the stream fails
   */
  public synchronized boolean next(Text key, Text value) throws IOException {
    numNext++;
    if (pos_ >= end_) {
      return false;
    }

    DataOutputBuffer buf = new DataOutputBuffer();
    if (!readUntilMatchBegin()) {
      return false;
    }
    if (!readUntilMatchEnd(buf)) {
      return false;
    }

    // There is only one elem..key/value splitting is not done here.
    // Copy exactly getLength() bytes: the array returned by getData() is
    // copied only up to the reported length.
    byte[] record = new byte[buf.getLength()];
    System.arraycopy(buf.getData(), 0, record, 0, record.length);

    numRecStats(record, 0, record.length);

    key.set(record);
    value.set("");

    return true;
  }

  /**
   * Advances the stream to the next begin mark. The match result is ignored.
   *
   * @throws IOException if reading the stream fails
   */
  public void seekNextRecordBoundary() throws IOException {
    readUntilMatchBegin();
  }

  /**
   * Scans for the begin mark using the slow (pattern) or fast (literal)
   * matcher, depending on slowMatch_. The mark is not included and nothing is
   * captured (includePat is false and the output buffer is null).
   *
   * @return the result of the selected matcher
   * @throws IOException if reading the stream fails
   */
  boolean readUntilMatchBegin() throws IOException {
    if (slowMatch_) {
      return slowReadUntilMatch(beginPat_, false, null);
    } else {
      return fastReadUntilMatch(beginMark_, false, null);
    }
  }

  /**
   * Scans for the end mark using the slow (pattern) or fast (literal) matcher,
   * depending on slowMatch_, with includePat set to true and buf passed as the
   * output buffer.
   *
   * @param buf buffer handed to the matcher as its output buffer
   * @return the result of the selected matcher
   * @throws IOException if reading the stream fails
   */
  private boolean readUntilMatchEnd(DataOutputBuffer buf) throws IOException {
    if (slowMatch_) {
      return slowReadUntilMatch(endPat_, true, buf);
    } else {
      return fastReadUntilMatch(endMark_, true, buf);
    }
  }

  /**
   * Pattern-based search for a record mark that also tracks CDATA begin/end
   * tokens, so that a mark seen in the wrong CDATA context can be rejected by
   * the nextState state machine (defined outside this view).
   *
   * @param markPattern  pattern whose group 1 is treated as a CDATA begin and
   *                     group 2 as a CDATA end; any other match is a candidate
   *                     record mark
   * @param includePat   if true the end of the first candidate match is used as
   *                     the end position, otherwise its start
   * @param outBufOrNull output buffer; callers pass null when only seeking
   * @return false if the stream is at end of file; otherwise see the remainder
   *         of the method
   * @throws IOException if reading the stream fails
   */
  private boolean slowReadUntilMatch(Pattern markPattern, boolean includePat,
                                     DataOutputBuffer outBufOrNull) throws IOException {
    byte[] buf = new byte[Math.max(lookAhead_, maxRecSize_)];
    int read = 0;
    bin_.mark(Math.max(lookAhead_, maxRecSize_) + 2); //mark to invalidate if we read more
    read = bin_.read(buf);
    if (read == -1) return false;

    String sbuf = new String(buf, 0, read, "UTF-8");
    Matcher match = markPattern.matcher(sbuf);

    firstMatchStart_ = NA;
    firstMatchEnd_ = NA;
    int bufPos = 0;
    // Once a previous call has determined the CDATA context, start outside
    // CDATA; otherwise the context is still unknown.
    int state = synched_ ? CDATA_OUT : CDATA_UNK;
    int s = 0;

    while (match.find(bufPos)) {
      int input;
      if (match.group(1) != null) {
        input = CDATA_BEGIN;
      } else if (match.group(2) != null) {
        input = CDATA_END;
        firstMatchStart_ = NA; // |<DOC CDATA[ </DOC> ]]> should keep it
      } else {
        input = RECORD_MAYBE;
      }
      if (input == RECORD_MAYBE) {
        // Remember only the first candidate mark since the last reset.
        if (firstMatchStart_ == NA) {
          firstMatchStart_ = match.start();
          firstMatchEnd_ = match.end();
        }
      }
      state = nextState(state, input, match.start());
      if (state == RECORD_ACCEPT) {
        break;
      }
      bufPos = match.end();
      s++;
    }
    if (state != CDATA_UNK) {
      synched_ = true;
    }
    // A candidate counts as matched when it was accepted outright, or when the
    // CDATA context was never determined.
    boolean matched = (firstMatchStart_ != NA) && (state == RECORD_ACCEPT || state == CDATA_UNK);
    if (matched) {
      int endPos = includePat ? firstMatchEnd_ : firstMatchStart_;
      bin_.reset();