<?php
/**
* a bare page class
*
* @author Natalie Kather <natalie.kather@northclick.de>
* @var string link
* @var string title
* @var string content
*/
class page {
    /** @var string address (URL or path) of the page */
    public $link;

    /** @var string|null title of the page; null when none was supplied */
    public $title;

    /** @var string|null text content of the page; null when none was supplied */
    public $content;

    /**
     * Creates a bare page object.
     *
     * $title and $content are optional so that callers which only pass a
     * link keep working; omitted values are stored as null.
     *
     * @param string      $link
     * @param string|null $title
     * @param string|null $content
     */
    public function __construct($link, $title = null, $content = null) {
        $this->link    = $link;
        $this->title   = $title;
        $this->content = $content;
    }

    /**
     * Returns the title of the page.
     *
     * @return string|null
     */
    public function get_title() {
        return $this->title;
    }
}
/**
* a class that represents a page which is found by the search engine
*
* @author Natalie Kather <natalie.kather@northclick.de>
* @var float $score
* @var string $link_hl highlighted link
*/
class page_found extends page {
    /** @var string score formatted with two decimal places */
    private $score;

    /** @var string|null link with highlight markup; filled by highlight_all() */
    private $link_hl;

    /**
     * Creates a search hit. The content is shortened with cut() and the
     * score is stored as a string with two decimal places.
     *
     * @param string $link
     * @param string $title
     * @param string $content
     * @param float  $score
     */
    public function __construct($link, $title, $content, $score) {
        $this->link    = $link;
        $this->title   = $title;
        $this->content = $this->cut($content);
        $this->score   = sprintf('%.2f', $score);
    }

    /**
     * Shortens a string (usually $content) to at most $width bytes, cutting
     * at a word boundary and appending ' ...'.
     *
     * Strings that already fit are returned unchanged. Lengths are counted
     * in bytes (strlen/substr); a cut at a space never splits a multi-byte
     * character, only the no-space fallback below can.
     *
     * @param string $input
     * @param int    $width maximum number of bytes kept from $input
     * @return string
     */
    public function cut($input, $width = 200) {
        if (strlen($input) <= $width) {
            return $input;
        }

        $head = substr($input, 0, $width);

        // The cut already falls exactly between two words: keep all of it.
        if ($input[$width] === ' ') {
            return rtrim($head) . ' ...';
        }

        $lastSpace = strrpos($head, ' ');
        if ($lastSpace === false) {
            // One unbroken run of characters: no word boundary to cut at.
            return $head . ' ...';
        }

        // Drop the partial last word.
        return rtrim(substr($head, 0, $lastSpace)) . ' ...';
    }

    /**
     * Returns (does not print) the page as an HTML paragraph: linked title,
     * content, highlighted URL and score.
     *
     * Title, content and link are emitted as stored, without HTML escaping.
     *
     * @return string
     */
    public function show() {
        // Before highlight_all() has run there is no highlighted link yet;
        // show the plain link instead of an empty URL line.
        $shownLink = ($this->link_hl !== null) ? $this->link_hl : $this->link;

        $output  = '<p>';
        $output .= "<a href=\"" . $this->link . "\">" . $this->title . "</a><br />\n";
        $output .= $this->content . "<br />\n";
        $output .= "<i>URL: " . $shownLink . "</i><br />\n";
        $output .= "Score: " . $this->score . "</p>\n";
        return $output;
    }

    /**
     * Returns the formatted score.
     *
     * @return string
     */
    public function get_score() {
        return $this->score;
    }

    /**
     * Highlights the query in title and content (in place) and stores a
     * highlighted copy of the link; the plain link stays untouched because
     * show() uses it as the href.
     *
     * @param string $query terms to highlight in each element
     * @return boolean true
     */
    public function highlight_all($query) {
        $this->title   = $this->highlight($this->title, $query);
        $this->content = $this->highlight($this->content, $query);
        $this->link_hl = $this->highlight($this->link, $query);
        return true;
    }

    /**
     * Wraps every occurrence of a query term in <strong> tags.
     *
     * The query is split on whitespace into terms. A term is only matched
     * case-insensitively as a whole token, i.e. when it is delimited on both
     * sides by the start/end of the string or by one of the separator
     * characters: space ' " . / ( ) - _ @
     *
     * @param string $string text to go through
     * @param string $query  one or more terms separated by whitespace
     * @return string highlighted string; $string unchanged if the query
     *                contains no terms
     */
    public function highlight($string, $query) {
        $terms = preg_split('/\s+/', trim($query), -1, PREG_SPLIT_NO_EMPTY);
        if (empty($terms)) {
            // An empty alternative would match everywhere.
            return $string;
        }

        $alternatives = array();
        foreach ($terms as $term) {
            // Quote for the '/' delimiter as well, so queries containing a
            // slash cannot break the pattern.
            $alternatives[] = preg_quote($term, '/');
        }

        $separators  = ' \'".\/()\-_@';
        $notBoundary = '[^' . $separators . ']';

        // Lookarounds do not consume the separators, so neighbouring hits
        // that share a single separator are all highlighted.
        $pattern = '/(?<!' . $notBoundary . ')('
                 . implode('|', $alternatives)
                 . ')(?!' . $notBoundary . ')/i';

        $highlighted = preg_replace($pattern, '<strong>$1</strong>', $string);

        // preg_replace() returns null on a PCRE error; keep the text intact.
        return ($highlighted === null) ? $string : $highlighted;
    }
}
/**
* A page class which is needed to add a specific page to the index
*
* @author Natalie Kather <natalie.kather@northclick.de>
* @var string $raw_content the whole content of a specific page
*/
class page_add extends page {
    /** @var string raw source of the page; empty string if it could not be read */
    private $raw_content;

    /**
     * Reads the page at $link and extracts its title and content area via
     * pick_title() and pick_content().
     *
     * @param string $link address passed to file_get_contents()
     */
    public function __construct($link) {
        $this->link = $link;

        $raw = file_get_contents($link);
        // file_get_contents() returns false on failure; treat that as an
        // empty document so title and content end up as empty strings.
        $this->raw_content = ($raw === false) ? '' : $raw;

        $this->pick_title();
        $this->pick_content();
    }

    /**
     * Former PHP4-style constructor, kept as an ordinary method so existing
     * direct calls keep working. Re-initialises the object from $link.
     *
     * @deprecated use "new page_add($link)"
     * @param string $link
     * @return boolean true
     */
    public function page_add($link) {
        $this->__construct($link);
        return true;
    }

    /**
     * Takes the text between <!-- content_start --> and
     * <!-- content_ende --> from the raw source, removes HTML tags, decodes
     * HTML entities, collapses all whitespace to single spaces and stores
     * the result in $content.
     *
     * @return boolean true
     */
    public function pick_content() {
        $text = $this->get_contentwtags();

        $text = str_replace('>>', '', $text);
        $text = strip_tags($text);
        // Second pass after separating lines, as in the original pipeline.
        $text = str_replace("\n", " \n", $text);
        $text = strip_tags($text);

        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
        $text = str_replace('>>', '', $text);
        $text = str_replace("\0", '', $text);

        // Every run of whitespace (space, tab, CR, LF, VT) becomes one
        // space, so words separated only by a tab or line break stay apart.
        $text = preg_replace('/[ \t\r\n\x0B]+/', ' ', $text);

        $this->content = trim($text);
        return true;
    }

    /**
     * Returns the text between <!-- content_start --> and
     * <!-- content_ende --> from the raw source WITH HTML tags;
     * this is necessary to extract links.
     *
     * If the end marker is missing, everything after the start marker is
     * returned; if the start marker is missing, an empty string is returned.
     *
     * @return string
     */
    public function get_contentwtags() {
        $startMarker = '<!-- content_start -->';
        $endMarker   = '<!-- content_ende -->';

        $from = strpos($this->raw_content, $startMarker);
        if ($from === false) {
            return '';
        }
        $from += strlen($startMarker);

        $to = strpos($this->raw_content, $endMarker, $from);
        if ($to === false) {
            // Cast: substr() yields false on older PHP versions when $from
            // is the end of the string.
            return (string) substr($this->raw_content, $from);
        }

        return (string) substr($this->raw_content, $from, $to - $from);
    }

    /**
     * Extracts the text of the first <title> element of the raw source and
     * stores it, trimmed, in $title. The title is an empty string when the
     * document has no <title> element.
     *
     * @return boolean true
     */
    public function pick_title() {
        $title = '';
        if (preg_match('/<title\b[^>]*>(.*?)<\/title\s*>/is', $this->raw_content, $match)) {
            $title = trim($match[1]);
        }

        $this->title = $title;
        return true;
    }

    /**
     * Returns (not prints!) the page as an HTML paragraph consisting of
     * title, content and link; the content is NOT cut.
     *
     * @return string
     */
    public function show() {
        $output  = '<p>';
        $output .= $this->title . "<br />\n";
        $output .= $this->content . "<br />\n";
        $output .= $this->link . "</p>\n\n";
        return $output;
    }

    /**
     * @return string
     */
    public function get_title() {
        return $this->title;
    }

    /**
     * @return string
     */
    public function get_link() {
        return $this->link;
    }

    /**
     * @return string
     */
    public function get_content() {
        return $this->content;
    }
}
?>