| 1 | /** |
| 2 | * Licensed to the Apache Software Foundation (ASF) under one or more |
| 3 | * contributor license agreements. See the NOTICE file distributed with |
| 4 | * this work for additional information regarding copyright ownership. |
| 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 |
| 6 | * (the "License"); you may not use this file except in compliance with |
| 7 | * the License. You may obtain a copy of the License at |
| 8 | * |
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | * |
| 11 | * Unless required by applicable law or agreed to in writing, software |
| 12 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | * See the License for the specific language governing permissions and |
| 15 | * limitations under the License. |
| 16 | */ |
| 17 | |
| 18 | package felix.thirdpart; |
| 19 | |
| 20 | import java.io.IOException; |
| 21 | |
| 22 | import com.google.common.base.Charsets; |
| 23 | import com.google.common.io.Closeables; |
| 24 | import org.apache.hadoop.conf.Configuration; |
| 25 | import org.apache.hadoop.fs.FSDataInputStream; |
| 26 | import org.apache.hadoop.fs.FileSystem; |
| 27 | import org.apache.hadoop.fs.Path; |
| 28 | import org.apache.hadoop.io.DataOutputBuffer; |
| 29 | import org.apache.hadoop.io.LongWritable; |
| 30 | import org.apache.hadoop.io.Text; |
| 31 | import org.apache.hadoop.mapreduce.InputSplit; |
| 32 | import org.apache.hadoop.mapreduce.RecordReader; |
| 33 | import org.apache.hadoop.mapreduce.TaskAttemptContext; |
| 34 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; |
| 35 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; |
| 36 | import org.slf4j.Logger; |
| 37 | import org.slf4j.LoggerFactory; |
| 38 | |
| 39 | /** |
| 40 | * Reads records that are delimited by a specific begin/end tag |
| 41 | * -- ACK: THIS THIRD-PART CLASS IS NOT WRITTEN BY FELIX'S AUTHORS. |
| 42 | * |
| 43 | */ |
| 44 | public class XmlInputFormat extends TextInputFormat { |
| 45 | |
| 46 | private static final Logger log = LoggerFactory.getLogger(XmlInputFormat.class); |
| 47 | |
| 48 | public static final String START_TAG_KEY = "xmlinput.start"; |
| 49 | public static final String END_TAG_KEY = "xmlinput.end"; |
| 50 | |
| 51 | /** |
| 52 | * Returns XMLRecord reader to read xml document. |
| 53 | */ |
| 54 | @Override |
| 55 | public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) { |
| 56 | try { |
| 57 | return new XmlRecordReader((FileSplit) split, context.getConfiguration()); |
| 58 | } catch (IOException ioe) { |
| 59 | log.warn("Error while creating XmlRecordReader", ioe); |
| 60 | return null; |
| 61 | } |
| 62 | } |
| 63 | |
| 64 | /** |
| 65 | * XMLRecordReader class to read through a given xml document to output xml blocks as records as specified |
| 66 | * by the start tag and end tag |
| 67 | * |
| 68 | */ |
| 69 | public static class XmlRecordReader extends RecordReader<LongWritable, Text> { |
| 70 | |
| 71 | private final byte[] startTag; |
| 72 | private final byte[] endTag; |
| 73 | private final long start; |
| 74 | private final long end; |
| 75 | private final FSDataInputStream fsin; |
| 76 | private final DataOutputBuffer buffer = new DataOutputBuffer(); |
| 77 | private LongWritable currentKey; |
| 78 | private Text currentValue; |
| 79 | |
| 80 | /** |
| 81 | * The constructor. |
| 82 | * @param split |
| 83 | * @param conf |
| 84 | * @throws IOException |
| 85 | */ |
| 86 | public XmlRecordReader(FileSplit split, Configuration conf) throws IOException { |
| 87 | startTag = conf.get(START_TAG_KEY).getBytes(Charsets.UTF_8); |
| 88 | endTag = conf.get(END_TAG_KEY).getBytes(Charsets.UTF_8); |
| 89 | |
| 90 | // open the file and seek to the start of the split |
| 91 | start = split.getStart(); |
| 92 | end = start + split.getLength(); |
| 93 | Path file = split.getPath(); |
| 94 | FileSystem fs = file.getFileSystem(conf); |
| 95 | fsin = fs.open(split.getPath()); |
| 96 | fsin.seek(start); |
| 97 | } |
| 98 | |
| 99 | /** |
| 100 | * Sets next key, value and returns true if possible. |
| 101 | * @param key |
| 102 | * @param value |
| 103 | * @return |
| 104 | * @throws IOException |
| 105 | */ |
| 106 | private boolean next(LongWritable key, Text value) throws IOException { |
| 107 | if (fsin.getPos() < end && readUntilMatch(startTag, false)) { |
| 108 | try { |
| 109 | buffer.write(startTag); |
| 110 | if (readUntilMatch(endTag, true)) { |
| 111 | key.set(fsin.getPos()); |
| 112 | value.set(buffer.getData(), 0, buffer.getLength()); |
| 113 | return true; |
| 114 | } |
| 115 | } finally { |
| 116 | buffer.reset(); |
| 117 | } |
| 118 | } |
| 119 | return false; |
| 120 | } |
| 121 | |
| 122 | @Override |
| 123 | public void close() throws IOException { |
| 124 | Closeables.closeQuietly(fsin); |
| 125 | } |
| 126 | |
| 127 | /** |
| 128 | * Returns progress through file. |
| 129 | */ |
| 130 | @Override |
| 131 | public float getProgress() throws IOException { |
| 132 | return (fsin.getPos() - start) / (float) (end - start); |
| 133 | } |
| 134 | |
| 135 | /** |
| 136 | * Reads until given match is found. |
| 137 | * @param match |
| 138 | * @param withinBlock |
| 139 | * @return |
| 140 | * @throws IOException |
| 141 | */ |
| 142 | private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException { |
| 143 | int i = 0; |
| 144 | while (true) { |
| 145 | int b = fsin.read(); |
| 146 | // end of file: |
| 147 | if (b == -1) { |
| 148 | return false; |
| 149 | } |
| 150 | // save to buffer: |
| 151 | if (withinBlock) { |
| 152 | buffer.write(b); |
| 153 | } |
| 154 | |
| 155 | // check if we're matching: |
| 156 | if (b == match[i]) { |
| 157 | i++; |
| 158 | if (i >= match.length) { |
| 159 | return true; |
| 160 | } |
| 161 | } else { |
| 162 | i = 0; |
| 163 | } |
| 164 | // see if we've passed the stop point: |
| 165 | if (!withinBlock && i == 0 && fsin.getPos() >= end) { |
| 166 | return false; |
| 167 | } |
| 168 | } |
| 169 | } |
| 170 | |
| 171 | /** |
| 172 | * Returns current key. |
| 173 | */ |
| 174 | @Override |
| 175 | public LongWritable getCurrentKey() throws IOException, InterruptedException { |
| 176 | return currentKey; |
| 177 | } |
| 178 | |
| 179 | /** |
| 180 | * Returns current value. |
| 181 | */ |
| 182 | @Override |
| 183 | public Text getCurrentValue() throws IOException, InterruptedException { |
| 184 | return currentValue; |
| 185 | } |
| 186 | |
| 187 | @Override |
| 188 | public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { |
| 189 | } |
| 190 | |
| 191 | /** |
| 192 | * Sets next key, value. |
| 193 | */ |
| 194 | @Override |
| 195 | public boolean nextKeyValue() throws IOException, InterruptedException { |
| 196 | currentKey = new LongWritable(); |
| 197 | currentValue = new Text(); |
| 198 | return next(currentKey, currentValue); |
| 199 | } |
| 200 | } |
| 201 | } |