package xmlparsing.demo;
importjavax.xml.stream.XMLInputFactory;
//import mrdp.logging.LogWriter;
importorg.apache.hadoop.fs.FileSystem;
importorg.apache.hadoop.fs.Path;
importorg.apache.hadoop.conf.*;
importorg.apache.hadoop.io.*;
importorg.apache.hadoop.mapred.TextOutputFormat;
importorg.apache.hadoop.mapreduce.*;
importorg.apache.hadoop.mapreduce.lib.input.FileInputFormat;
importorg.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
importorg.apache.hadoop.util.GenericOptionsParser;
publicclassXMLDriver {
publicstaticvoidmain(String[] args) {
try{
Configuration conf = newConfiguration();
String[] arg = newGenericOptionsParser(conf, args).getRemainingArgs();
conf.set(“START_TAG_KEY”, “<employee>”);
conf.set(“END_TAG_KEY”, “</employee>”);
Job job = newJob(conf, “XML Processing Processing”);
job.setJarByClass(XMLDriver.class);
job.setMapperClass(MyMapper.class);
job.setNumReduceTasks(0);
job.setInputFormatClass(XmlInputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
FileInputFormat.addInputPath(job, newPath(args[0]));
FileOutputFormat.setOutputPath(job, newPath(args[1]));
job.waitForCompletion(true);
} catch(Exception e) {
LogWriter.getInstance().WriteLog(“Driver Error: “+ e.getMessage());
System.out.println(e.getMessage().toString());
}
}
}
Step 3: Create MyMapper.javapackage xmlparsing.demo;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
//import mrdp.logging.LogWriter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
public class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
private static final Log LOG = LogFactory.getLog(MyMapper.class);
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
try {
InputStream is = new ByteArrayInputStream(value.toString().getBytes());
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
Document doc = dBuilder.parse(is);
doc.getDocumentElement().normalize();
NodeList nList = doc.getElementsByTagName(“employee”);
for (int temp = 0; temp < nList.getLength(); temp++) {
Node nNode = nList.item(temp);
if (nNode.getNodeType() == Node.ELEMENT_NODE) {
Element eElement = (Element) nNode;
String id = eElement.getElementsByTagName(“id”).item(0).getTextContent();
String name = eElement.getElementsByTagName(“name”).item(0).getTextContent();
String gender = eElement.getElementsByTagName(“gender”).item(0).getTextContent();
// System.out.println(id + “,” + name + “,” + gender);
context.write(new Text(id + “,” + name + “,” + gender), NullWritable.get());
}
}
} catch (Exception e) {
LogWriter.getInstance().WriteLog(e.getMessage());
}
}
}