1 /***
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.nutch.parse.html;
19
20 import java.util.ArrayList;
21 import java.util.Map;
22 import java.net.URL;
23 import java.net.MalformedURLException;
24 import java.nio.charset.Charset;
25 import java.io.*;
26 import java.util.regex.*;
27
28 import org.cyberneko.html.parsers.*;
29 import org.xml.sax.InputSource;
30 import org.xml.sax.SAXException;
31 import org.w3c.dom.*;
32 import org.apache.html.dom.*;
33
34 import org.apache.commons.logging.Log;
35 import org.apache.commons.logging.LogFactory;
36
37 import org.apache.nutch.metadata.Metadata;
38 import org.apache.nutch.metadata.Nutch;
39 import org.apache.nutch.protocol.Content;
40 import org.apache.hadoop.conf.*;
41 import org.apache.nutch.parse.*;
42 import org.apache.nutch.util.*;
43
/**
 * HTML parse plugin: builds a DOM from fetched HTML content, honours
 * meta directives (noindex/nofollow/nocache), and extracts text, title
 * and outlinks. The DOM is produced by either NekoHTML or TagSoup,
 * selected via the "parser.html.impl" configuration property.
 */
public class HtmlParser implements Parser {
  public static final Log LOG = LogFactory.getLog("org.apache.nutch.parse.html");

  // I used 1000 bytes at first, but found that some documents have
  // meta tag well past the first 1000 bytes.
  // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
  private static final int CHUNK_SIZE = 2000;

  // Matches a <meta ... http-equiv="content-type" ...> tag anywhere in
  // the sniffed chunk; group(1) captures the tag's attribute text.
  private static Pattern metaPattern =
    Pattern.compile("<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>",
                    Pattern.CASE_INSENSITIVE);
  // Extracts the charset name from the matched meta tag's attributes.
  private static Pattern charsetPattern =
    Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
                    Pattern.CASE_INSENSITIVE);

  // Which DOM parser to use: "tagsoup" or anything else for Neko
  // (default "neko"); set from configuration in setConf().
  private String parserImpl;
59
60 /***
61 * Given a <code>byte[]</code> representing an html file of an
62 * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag
63 * from the first <code>CHUNK_SIZE</code> bytes.
64 * If there's no meta tag for Content-Type or no charset is specified,
65 * <code>null</code> is returned. <br />
66 * FIXME: non-byte oriented character encodings (UTF-16, UTF-32)
67 * can't be handled with this.
68 * We need to do something similar to what's done by mozilla
69 * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser.cpp#1993).
70 * See also http://www.w3.org/TR/REC-xml/#sec-guessing
71 * <br />
72 *
73 * @param content <code>byte[]</code> representation of an html file
74 */
75
76 private static String sniffCharacterEncoding(byte[] content) {
77 int length = content.length < CHUNK_SIZE ?
78 content.length : CHUNK_SIZE;
79
80 // We don't care about non-ASCII parts so that it's sufficient
81 // to just inflate each byte to a 16-bit value by padding.
82 // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
83 // {U+0041, U+0082, U+00B7}.
84 String str = "";
85 try {
86 str = new String(content, 0, length,
87 Charset.forName("ASCII").toString());
88 } catch (UnsupportedEncodingException e) {
89 // code should never come here, but just in case...
90 return null;
91 }
92
93 Matcher metaMatcher = metaPattern.matcher(str);
94 String encoding = null;
95 if (metaMatcher.find()) {
96 Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
97 if (charsetMatcher.find())
98 encoding = new String(charsetMatcher.group(1));
99 }
100
101 return encoding;
102 }
103
  // Fallback charset (property "parser.character.encoding.default") used
  // when encoding detection yields nothing.
  private String defaultCharEncoding;

  // Hadoop configuration injected via setConf().
  private Configuration conf;

  // Helper that walks the DOM to extract text, title, base and outlinks.
  private DOMContentUtils utils;

  // Chain of HtmlParseFilter plugins run over each parse result.
  private HtmlParseFilters htmlParseFilters;

  // Value written under Nutch.CACHING_FORBIDDEN_KEY when a page carries
  // the nocache meta directive.
  private String cachingPolicy;
  /**
   * Parses the given {@link Content} as HTML: detects the character
   * encoding, builds a DOM, honours the page's meta directives
   * (noindex / nofollow / nocache / refresh), extracts plain text, title
   * and outlinks, and finally runs the configured {@link HtmlParseFilters}.
   *
   * @param content the raw fetched content, assumed to be HTML
   * @return the (possibly filtered) parse result; on any failure an empty
   *         result carrying the failure {@link ParseStatus} is returned
   */
  public ParseResult getParse(Content content) {
    HTMLMetaTags metaTags = new HTMLMetaTags();

    // Base URL for resolving relative outlinks.
    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    Metadata metadata = new Metadata();

    // parse the content
    DocumentFragment root;
    try {
      byte[] contentInOctets = content.getContent();
      InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));

      // Combine clues from protocol headers, auto-detection and a sniff of
      // the leading bytes; fall back to the configured default encoding.
      EncodingDetector detector = new EncodingDetector(conf);
      detector.autoDetectClues(content, true);
      detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
      String encoding = detector.guessEncoding(content, defaultCharEncoding);

      // Record the encoding both as detected and as used for conversion.
      metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
      metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);

      input.setEncoding(encoding);
      if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); }
      root = parse(input);
    } catch (IOException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (DOMException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (SAXException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
      e.printStackTrace(LogUtil.getWarnStream(LOG));
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
      StringBuffer sb = new StringBuffer();
      if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); }
      utils.getText(sb, root); // extract text
      text = sb.toString();
      sb.setLength(0);
      if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); }
      utils.getTitle(sb, root); // extract title
      title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
      // A <base href=...> tag inside the document overrides the content's
      // base URL when resolving links.
      URL baseTag = utils.getBase(root);
      if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); }
      utils.getOutlinks(baseTag!=null?baseTag:base, l, root);
      outlinks = l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found "+outlinks.length+" outlinks in "+content.getUrl());
      }
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    // A meta refresh is surfaced as a redirect so the fetcher can follow it.
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(new String[] {metaTags.getRefreshHref().toString(),
        Integer.toString(metaTags.getRefreshTime())});
    }
    ParseData parseData = new ParseData(status, title, outlinks,
                                        content.getMetadata(), metadata);
    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
                                                 new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult,
      metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
      // Flag every parse in the result so downstream caching honours nocache.
      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
        entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY,
                                                      cachingPolicy);
    }
    return filteredParse;
  }
206
207 private DocumentFragment parse(InputSource input) throws Exception {
208 if (parserImpl.equalsIgnoreCase("tagsoup"))
209 return parseTagSoup(input);
210 else return parseNeko(input);
211 }
212
213 private DocumentFragment parseTagSoup(InputSource input) throws Exception {
214 HTMLDocumentImpl doc = new HTMLDocumentImpl();
215 DocumentFragment frag = doc.createDocumentFragment();
216 DOMBuilder builder = new DOMBuilder(doc, frag);
217 org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser();
218 reader.setContentHandler(builder);
219 reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
220 reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false);
221 reader.setProperty("http://xml.org/sax/properties/lexical-handler", builder);
222 reader.parse(input);
223 return frag;
224 }
225
226 private DocumentFragment parseNeko(InputSource input) throws Exception {
227 DOMFragmentParser parser = new DOMFragmentParser();
228 try {
229 parser.setFeature("http://cyberneko.org/html/features/augmentations",
230 true);
231 parser.setProperty("http://cyberneko.org/html/properties/default-encoding",
232 defaultCharEncoding);
233 parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset",
234 true);
235 parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
236 false);
237 parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",
238 true);
239 parser.setFeature("http://cyberneko.org/html/features/report-errors",
240 LOG.isTraceEnabled());
241 } catch (SAXException e) {}
242 // convert Document to DocumentFragment
243 HTMLDocumentImpl doc = new HTMLDocumentImpl();
244 doc.setErrorChecking(false);
245 DocumentFragment res = doc.createDocumentFragment();
246 DocumentFragment frag = doc.createDocumentFragment();
247 parser.parse(input, frag);
248 res.appendChild(frag);
249
250 try {
251 while(true) {
252 frag = doc.createDocumentFragment();
253 parser.parse(input, frag);
254 if (!frag.hasChildNodes()) break;
255 if (LOG.isInfoEnabled()) {
256 LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes.");
257 }
258 res.appendChild(frag);
259 }
260 } catch (Exception x) { x.printStackTrace(LogUtil.getWarnStream(LOG));};
261 return res;
262 }
263
264 public static void main(String[] args) throws Exception {
265 //LOG.setLevel(Level.FINE);
266 String name = args[0];
267 String url = "file:"+name;
268 File file = new File(name);
269 byte[] bytes = new byte[(int)file.length()];
270 DataInputStream in = new DataInputStream(new FileInputStream(file));
271 in.readFully(bytes);
272 Configuration conf = NutchConfiguration.create();
273 HtmlParser parser = new HtmlParser();
274 parser.setConf(conf);
275 Parse parse = parser.getParse(
276 new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
277 System.out.println("data: "+parse.getData());
278
279 System.out.println("text: "+parse.getText());
280
281 }
282
283 public void setConf(Configuration conf) {
284 this.conf = conf;
285 this.htmlParseFilters = new HtmlParseFilters(getConf());
286 this.parserImpl = getConf().get("parser.html.impl", "neko");
287 this.defaultCharEncoding = getConf().get(
288 "parser.character.encoding.default", "windows-1252");
289 this.utils = new DOMContentUtils(conf);
290 this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
291 Nutch.CACHING_FORBIDDEN_CONTENT);
292 }
293
294 public Configuration getConf() {
295 return this.conf;
296 }
297 }