mirror of
https://github.com/cambecc/air.git
synced 2025-12-08 21:26:22 +00:00
109 lines
3.7 KiB
JavaScript
109 lines
3.7 KiB
JavaScript
'use strict';
|
|
|
|
var when = require('when');
|
|
var http = require('http');
|
|
var htmlparser = require('htmlparser');
|
|
|
|
/**
 * Parses a string of HTML markup and hands back the dom that was built from it.
 *
 * @param {string} text the HTML markup to parse.
 * @returns {Object} the dom collected by the htmlparser handler.
 */
function parseHTML(text) {
    var handlerOptions = {verbose: false, ignoreWhitespace: true};
    var domBuilder = new htmlparser.DefaultHandler(null, handlerOptions);
    var parser = new htmlparser.Parser(domBuilder);
    parser.parseComplete(text);
    return domBuilder.dom;
}
exports.parseHTML = parseHTML;
|
|
|
|
/**
|
|
* Returns all <table> tags contained in the provided dom as elements in an array.
|
|
*
|
|
* @param {Object} dom a parse tree obtained from calling the parseHTML function.
|
|
* @returns {Array} an array of all tables and their associated sub trees.
|
|
*/
|
|
exports.tablesOf = function(dom) {
|
|
return htmlparser.DomUtils.getElements({tag_type: 'tag', tag_name: 'table'}, dom);
|
|
}
|
|
|
|
/**
 * Collects every text node found in the provided dom.
 *
 * @param {Object} dom a parse tree obtained from calling the parseHTML function.
 * @returns {Array} the text nodes found by htmlparser's DomUtils.getElements.
 */
function textsOf(dom) {
    var textQuery = {tag_type: 'text'};
    return htmlparser.DomUtils.getElements(textQuery, dom);
}
exports.textsOf = textsOf;
|
|
|
|
/**
 * Collects every <tr> tag found in the provided dom, which is presumably a tree rooted at a table node.
 *
 * @param {Object} dom a parse tree obtained from calling the parseHTML function.
 * @returns {Array} the row elements found by htmlparser's DomUtils.getElements.
 */
function rowsOf(dom) {
    var rowQuery = {tag_type: 'tag', tag_name: 'tr'};
    return htmlparser.DomUtils.getElements(rowQuery, dom);
}
exports.rowsOf = rowsOf;
|
|
|
|
/**
|
|
* Given an html table comprised of rows having the <tr> tag, return a two-dimensional array of all cell values.
|
|
*
|
|
* @param {Object} table a parse tree obtained from calling the parseHTML function.
|
|
* @returns {Array} an array of rows, each row being an array of trimmed text values.
|
|
*/
|
|
exports.extract = function(table) {
|
|
return rowsOf(table).map(function(row) {
|
|
return textsOf(row).map(function(text) {
|
|
return text.data.trim();
|
|
});
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Returns the match results of all text nodes in the provided dom, satisfying the specified regex, as elements
|
|
* in an array.
|
|
*
|
|
* @param regex a regular expression.
|
|
* @param {Object} dom a parse tree obtained from calling the parseHTML function.
|
|
* @returns {Array} an array of regex match results.
|
|
*/
|
|
exports.matchText = function(regex, dom) {
|
|
var results = [];
|
|
function matchForRegex(data) {
|
|
var match = data.match(regex);
|
|
return match ? results.push(match) : false;
|
|
}
|
|
htmlparser.DomUtils.getElements({tag_type: 'text', tag_contains: matchForRegex}, dom);
|
|
return results;
|
|
}
|
|
|
|
/**
|
|
* Performs an http GET and parses the HTML into a dom. The result is a promise for the dom.
|
|
*
|
|
* @param options same as those taken by the http.request method.
|
|
* @param [converter] a callback that takes a buffer and converts it to another format.
|
|
* @returns {promise} a promise for the parsed dom of the specified url
|
|
*/
|
|
exports.fetch = function(options, converter) {
|
|
converter = converter || function nop(buffer) { return buffer; };
|
|
var d = when.defer();
|
|
console.log('get: ' + options);
|
|
http.get(options, function(response) {
|
|
var chunks = [];
|
|
response.on('data', function(chunk) {
|
|
chunks.push(chunk);
|
|
});
|
|
response.on('end', function() {
|
|
console.log('got: ' + options);
|
|
var converted = converter(Buffer.concat(chunks));
|
|
var parsed = parseHTML(converted);
|
|
console.log('done: ' + options);
|
|
d.resolve(parsed);
|
|
});
|
|
}).on('error', function(error) {
|
|
d.reject(error);
|
|
});
|
|
return d.promise;
|
|
}
|