1 /***
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.nutch.parse.html;
19
20 import java.util.ArrayList;
21 import java.util.Map;
22 import java.net.URL;
23 import java.net.MalformedURLException;
24 import java.nio.charset.Charset;
25 import java.io.*;
26 import java.util.regex.*;
27
28 import org.cyberneko.html.parsers.*;
29 import org.xml.sax.InputSource;
30 import org.xml.sax.SAXException;
31 import org.w3c.dom.*;
32 import org.apache.html.dom.*;
33
34 import org.apache.commons.logging.Log;
35 import org.apache.commons.logging.LogFactory;
36
37 import org.apache.nutch.metadata.Metadata;
38 import org.apache.nutch.metadata.Nutch;
39 import org.apache.nutch.protocol.Content;
40 import org.apache.hadoop.conf.*;
41 import org.apache.nutch.parse.*;
42 import org.apache.nutch.util.*;
43
/**
 * HTML parse plugin: builds a DOM from fetched HTML content, honours
 * meta directives (noindex/nofollow/nocache), and extracts text, title
 * and outlinks. The DOM is produced by either NekoHTML or TagSoup,
 * selected via the "parser.html.impl" configuration property.
 */
public class HtmlParser implements Parser {
  public static final Log LOG = LogFactory.getLog("org.apache.nutch.parse.html");

  // I used 1000 bytes at first, but found that some documents have
  // meta tag well past the first 1000 bytes.
  // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
  private static final int CHUNK_SIZE = 2000;

  // Matches a <meta ... http-equiv="content-type" ...> tag anywhere in
  // the sniffed chunk; group(1) captures the tag's attribute text.
  private static Pattern metaPattern =
    Pattern.compile("<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>",
                    Pattern.CASE_INSENSITIVE);
  // Extracts the charset name from the matched meta tag's attributes.
  private static Pattern charsetPattern =
    Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
                    Pattern.CASE_INSENSITIVE);

  // Which DOM parser to use: "tagsoup" or anything else for Neko
  // (default "neko"); set from configuration in setConf().
  private String parserImpl;
59
60 /***
61 * Given a <code>byte[]</code> representing an html file of an
62 * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag
63 * from the first <code>CHUNK_SIZE</code> bytes.
64 * If there's no meta tag for Content-Type or no charset is specified,
65 * <code>null</code> is returned. <br />
66 * FIXME: non-byte oriented character encodings (UTF-16, UTF-32)
67 * can't be handled with this.
68 * We need to do something similar to what's done by mozilla
69 * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser.cpp#1993).
70 * See also http://www.w3.org/TR/REC-xml/#sec-guessing
71 * <br />
72 *
73 * @param content <code>byte[]</code> representation of an html file
74 */
75
76 private static String sniffCharacterEncoding(byte[] content) {
77 int length = content.length < CHUNK_SIZE ?
78 content.length : CHUNK_SIZE;
79
80 // We don't care about non-ASCII parts so that it's sufficient
81 // to just inflate each byte to a 16-bit value by padding.
82 // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
83 // {U+0041, U+0082, U+00B7}.
84 String str = "";
85 try {
86 str = new String(content, 0, length,
87 Charset.forName("ASCII").toString());
88 } catch (UnsupportedEncodingException e) {
89 // code should never come here, but just in case...
90 return null;
91 }
92
93 Matcher metaMatcher = metaPattern.matcher(str);
94 String encoding = null;
95 if (metaMatcher.find()) {
96 Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
97 if (charsetMatcher.find())
98 encoding = new String(charsetMatcher.group(1));
99 }
100
101 return encoding;
102 }
103
  // Fallback charset (property "parser.character.encoding.default") used
  // when encoding detection yields nothing.
  private String defaultCharEncoding;

  // Hadoop configuration injected via setConf().
  private Configuration conf;

  // Helper that walks the DOM to extract text, title, base and outlinks.
  private DOMContentUtils utils;

  // Chain of HtmlParseFilter plugins run over each parse result.
  private HtmlParseFilters htmlParseFilters;

  // Value written under Nutch.CACHING_FORBIDDEN_KEY when a page carries
  // the nocache meta directive.
  private String cachingPolicy;
  /**
   * Parses the given {@link Content} as HTML: detects the character
   * encoding, builds a DOM, honours the page's meta directives
   * (noindex / nofollow / nocache / refresh), extracts plain text, title
   * and outlinks, and finally runs the configured {@link HtmlParseFilters}.
   *
   * @param content the raw fetched content, assumed to be HTML
   * @return the (possibly filtered) parse result; on any failure an empty
   *         result carrying the failure {@link ParseStatus} is returned
   */
  public ParseResult getParse(Content content) {
    HTMLMetaTags metaTags = new HTMLMetaTags();

    // Base URL for resolving relative outlinks.
    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    Metadata metadata = new Metadata();

    // parse the content
    DocumentFragment root;
    try {
      byte[] contentInOctets = content.getContent();
      InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));

      // Combine clues from protocol headers, auto-detection and a sniff of
      // the leading bytes; fall back to the configured default encoding.
      EncodingDetector detector = new EncodingDetector(conf);
      detector.autoDetectClues(content, true);
      detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
      String encoding = detector.guessEncoding(content, defaultCharEncoding);

      // Record the encoding both as detected and as used for conversion.
      metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
      metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);

      input.setEncoding(encoding);
      if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); }
      root = parse(input);
    } catch (IOException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (DOMException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (SAXException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
      e.printStackTrace(LogUtil.getWarnStream(LOG));
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
      StringBuffer sb = new StringBuffer();
      if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); }
      utils.getText(sb, root); // extract text
      text = sb.toString();
      sb.setLength(0);
      if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); }
      utils.getTitle(sb, root); // extract title
      title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
      // A <base href=...> tag inside the document overrides the content's
      // base URL when resolving links.
      URL baseTag = utils.getBase(root);
      if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); }
      utils.getOutlinks(baseTag!=null?baseTag:base, l, root);
      outlinks = l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found "+outlinks.length+" outlinks in "+content.getUrl());
      }
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    // A meta refresh is surfaced as a redirect so the fetcher can follow it.
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(new String[] {metaTags.getRefreshHref().toString(),
        Integer.toString(metaTags.getRefreshTime())});
    }
    ParseData parseData = new ParseData(status, title, outlinks,
                                        content.getMetadata(), metadata);
    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
                                                 new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult,
      metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
      // Flag every parse in the result so downstream caching honours nocache.
      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
        entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY,
                                                      cachingPolicy);
    }
    return filteredParse;
  }
206
207 private DocumentFragment parse(InputSource input) throws Exception {
208 if (parserImpl.equalsIgnoreCase("tagsoup"))
209 return parseTagSoup(input);
210 else return parseNeko(input);
211 }
212
213 private DocumentFragment parseTagSoup(InputSource input) throws Exception {
214 HTMLDocumentImpl doc = new HTMLDocumentImpl();
215 DocumentFragment frag = doc.createDocumentFragment();
216 DOMBuilder builder = new DOMBuilder(doc, frag);
217 org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser();
218 reader.setContentHandler(builder);
219 reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
220 reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false);
221 reader.setProperty("http://xml.org/sax/properties/lexical-handler", builder);
222 reader.parse(input);
223 return frag;
224 }
225
226 private DocumentFragment parseNeko(InputSource input) throws Exception {
227 DOMFragmentParser parser = new DOMFragmentParser();
228 try {
229 parser.setFeature("http://cyberneko.org/html/features/augmentations",
230 true);
231 parser.setProperty("http://cyberneko.org/html/properties/default-encoding",
232 defaultCharEncoding);
233 parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset",
234 true);
235 parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
236 false);
237 parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",
238 true);
239 parser.setFeature("http://cyberneko.org/html/features/report-errors",
240 LOG.isTraceEnabled());
241 } catch (SAXException e) {}
242 // convert Document to DocumentFragment
243 HTMLDocumentImpl doc = new HTMLDocumentImpl();
244 doc.setErrorChecking(false);
245 DocumentFragment res = doc.createDocumentFragment();
246 DocumentFragment frag = doc.createDocumentFragment();
247 parser.parse(input, frag);
248 res.appendChild(frag);
249
250 try {
251 while(true) {
252 frag = doc.createDocumentFragment();
253 parser.parse(input, frag);
254 if (!frag.hasChildNodes()) break;
255 if (LOG.isInfoEnabled()) {
256 LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes.");
257 }
258 res.appendChild(frag);
259 }
260 } catch (Exception x) { x.printStackTrace(LogUtil.getWarnStream(LOG));};
261 return res;
262 }
263
264 public static void main(String[] args) throws Exception {
265 //LOG.setLevel(Level.FINE);
266 String name = args[0];
267 String url = "file:"+name;
268 File file = new File(name);
269 byte[] bytes = new byte[(int)file.length()];
270 DataInputStream in = new DataInputStream(new FileInputStream(file));
271 in.readFully(bytes);
272 Configuration conf = NutchConfiguration.create();
273 HtmlParser parser = new HtmlParser();
274 parser.setConf(conf);
275 Parse parse = parser.getParse(
276 new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
277 System.out.println("data: "+parse.getData());
278
279 System.out.println("text: "+parse.getText());
280
281 }
282
283 public void setConf(Configuration conf) {
284 this.conf = conf;
285 this.htmlParseFilters = new HtmlParseFilters(getConf());
286 this.parserImpl = getConf().get("parser.html.impl", "neko");
287 this.defaultCharEncoding = getConf().get(
288 "parser.character.encoding.default", "windows-1252");
289 this.utils = new DOMContentUtils(conf);
290 this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
291 Nutch.CACHING_FORBIDDEN_CONTENT);
292 }
293
294 public Configuration getConf() {
295 return this.conf;
296 }
297 }