<?php
/**
* This class manages a list of addresses (plain URL strings, not page objects).
* These addresses are read from a specific location (an XML sitemap or an HTML link list).
*
* @author Natalie Kather <natalie.kather@northclick.de>
* @var string $location
* @var string $list[]
*/
class sites_list {
    /** @var string Location (URL or path) the list is built from. */
    private $location;

    /** @var string[] Addresses extracted from $location. */
    private $list = array();

    /**
     * Stores the location and immediately builds the address list from it.
     *
     * @param string $location URL or path of an XML sitemap or an HTML document
     */
    public function __construct($location) {
        $this->location = $location;
        $this->create();
    }

    /**
     * Builds the list; decides by the file extension of $location whether it
     * is an XML sitemap or an HTML document and calls the corresponding method.
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @return boolean true
     */
    public function create() {
        // Require a literal ".xml" extension, optionally followed by a query
        // string or fragment. The former pattern '/.xml/' had an unescaped dot
        // and matched e.g. "?format=xml" or "/axml/" as well.
        if (preg_match('/\.xml(?:[?#]|$)/i', $this->location) === 1) {
            // create list from XML sitemap
            $this->extract_xml();
        } else {
            // extract links from HTML document
            $this->extract_html();
        }
        return true;
    }

    /**
     * Extracts the addresses from an XML sitemap (the <loc> child of every
     * <url> element) and stores them as the list.
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @throws Exception if the location cannot be read or is not valid XML
     */
    public function extract_xml() {
        $xmlstr = file_get_contents($this->location);
        if ($xmlstr === false) {
            // Fail with a meaningful message instead of the generic
            // "String could not be parsed as XML" raised for an empty string.
            throw new Exception('Could not read sitemap from ' . $this->location);
        }

        $xml = new SimpleXMLElement($xmlstr);
        $sites = array();
        foreach ($xml->url as $url) {
            // Plain string cast: the URL must not be used as a sprintf()
            // format string, since percent-encoded URLs ("%20") would break it.
            $sites[] = (string) $url->loc;
        }
        $this->list = $sites;
    }

    /**
     * Extracts links from the content area of an HTML document
     * (as returned by page_add::get_contentwtags()) and filters unwelcome
     * links such as mailto, anchor and javascript links.
     * Prepends the host of $location to relative links, removes trailing
     * PHP session ids and stores all addresses as the list.
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     */
    public function extract_html() {
        // Only grab links from the content area
        $content = new page_add($this->location);
        $html = $content->get_contentwtags();

        preg_match_all('/<a[^>]+href="([^"]+)/i', $html, $matches);
        $host = $this->extract_host();

        $sites = array();
        foreach ($matches[1] as $link) {
            if ($this->is_unwanted_link($link)) {
                continue;
            }
            $sites[] = $this->strip_session_id($this->make_absolute($link, $host));
        }
        $this->list = $sites;
    }

    /**
     * Returns "scheme://host/" of $location, or '' if $location contains no
     * http(s) URL (e.g. a local file path).
     *
     * @return string
     */
    private function extract_host() {
        if (preg_match('/https?:\/\/[^\/?#]+/i', $this->location, $m) === 1) {
            return $m[0] . '/';
        }
        return '';
    }

    /**
     * Tells whether a link is a mailto link, a javascript link or contains
     * an anchor (#).
     *
     * @param string $link raw href value
     * @return boolean
     */
    private function is_unwanted_link($link) {
        // Check the scheme only at the start, so URLs that merely contain the
        // words "mailto" or "javascript" somewhere are not dropped.
        if (preg_match('/^\s*(?:mailto|javascript):/i', $link) === 1) {
            return true;
        }
        return strpos($link, '#') !== false;
    }

    /**
     * Prepends the host to links that do not start with http:// or https://.
     *
     * @param string $link raw href value
     * @param string $host "scheme://host/" or '' if unknown
     * @return string
     */
    private function make_absolute($link, $host) {
        if ($host === '' || preg_match('/^https?:\/\//i', $link) === 1) {
            return $link;
        }
        // $host already ends with "/", so avoid "http://host//path".
        return $host . ltrim($link, '/');
    }

    /**
     * Removes a PHPSESSID parameter and everything after it from a URL, e.g.
     * http://example.org/detail.php?id=1074779769&PHPSESSID=41aa2a515d1e
     * becomes http://example.org/detail.php?id=1074779769
     *
     * @param string $url
     * @return string
     */
    private function strip_session_id($url) {
        // Accept "?", "&" and the HTML-escaped "&amp;" as separator
        $stripped = preg_replace('/(?:[?&]|&amp;)PHPSESSID=.*/s', '', $url);
        // preg_replace() returns null on a PCRE error; keep the URL unchanged then.
        return $stripped === null ? $url : $stripped;
    }

    /**
     * Prints the whole list in a human readable way, one address per line
     * separated by <br />.
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     */
    public function show_list() {
        foreach ($this->list as $item) {
            // The addresses come from external documents; escape them before
            // writing them into HTML output.
            echo htmlspecialchars($item, ENT_QUOTES) . "<br />\n";
        }
    }

    /**
     * Returns the extracted addresses.
     *
     * @return string[]
     */
    public function get_list() {
        return $this->list;
    }

    /**
     * Returns the location this list was built from.
     *
     * @return string
     */
    public function get_location() {
        return $this->location;
    }
}
?>