<?php
require_once('Zend/Search/Lucene.php');
/**
 * Thin wrapper around a Zend_Search_Lucene index of web pages.
 *
 * Pages are fetched by URL, reduced to three text fields (headline, contents,
 * link) and stored in an ASCII-only encoding produced by simplify(). Searching
 * converts the user's query to the same encoding, runs it against the index
 * and prints the hits.
 *
 * Collaborators that are not defined in this file: the classes page_add and
 * page_found, and the constant MAX_RESULTS.
 */
class search_index {
    /** Filesystem path of the Lucene index directory. */
    private $path;
    /** Trimmed query exactly as entered by the user (used for display/highlighting). */
    private $query;
    /** Query string in index encoding, as handed to Zend_Search_Lucene::find(). */
    private $query4index;
    /** Lower-cased links already present in the index (filled by get_all()). */
    private $existing = array();
    /** page_found objects collected by the last search(). */
    private $results = array();
    /** Number of documents in the index at the time of the last search(). */
    private $index_size;
    /** Total number of hits of the last search() (may exceed MAX_RESULTS). */
    private $number_results = 0;
    /** Index handle currently being written to by create()/append(). */
    private $index_set;
    /** Number of pages added during the current create()/append() run. */
    private $added_pages;
    /** Number of URLs handed to the current create()/append() run. */
    private $total;
    private $segment_size = 100; // (save every n pages)
    // Characters that simplify() rewrites, and their replacements (same position
    // in $char_new). Each replacement is wrapped in 'xxx' markers when stored.
    // NOTE(review): most entries appear twice in a row; presumably one of each
    // pair was originally a different spelling of the same character (e.g. an
    // HTML entity) - confirm against the original file encoding.
    private $char_old = array('ä', 'ä', 'Ä', 'Ä', 'ö', 'ö', 'Ö', 'Ö', 'ü', 'ü', 'Ü', 'ü', 'ß', 'ß', 'à', 'à', 'é', 'é', 'ë', 'ë', 'ñ', 'ñ', '&', '&', '°', '°', 'ç', 'ç', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '=');
    private $char_new = array('ae', 'ae', 'Ae', 'Ae', 'oe', 'oe', 'Oe', 'Oe', 'ue', 'ue', 'Ue', 'Ue', 'ss', 'ss', 'ag', 'ag', 'ea', 'ea', 'ee', 'ee', 'n', 'n', 'und', 'und', 'g', 'g', 'c', 'c', 'null', 'eins', 'zwei', 'drei', 'vier', 'fuenf', 'sechs', 'sieben', 'acht', 'neun', 'gleich');

    /**
     * @param string $path Directory of the Lucene index to create or open
     */
    public function __construct($path) {
        $this->path = $path;
    }

    /**
     * Creates a new index from a list of addresses.
     *
     * Progress is echoed as HTML. An empty list leaves any index on disk
     * untouched.
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @param string $list[] Array of formatted addresses
     * @return boolean True
     */
    public function create($list) {
        return $this->index_urls($list, true, false);
    }

    /**
     * Appends a list of addresses to the existing index.
     *
     * Addresses that are already indexed (see exists()) are skipped.
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @param string $list[] Array of formatted addresses
     * @return boolean True
     */
    public function append($list) {
        return $this->index_urls($list, false, true);
    }

    /**
     * Shared worker of create() and append(): walks the list, adds every
     * reachable page and commits the index at the end.
     *
     * @param string  $list[]        Array of formatted addresses
     * @param boolean $create_new    True to start with a fresh index
     * @param boolean $skip_existing True to skip addresses already indexed
     * @return boolean True
     */
    private function index_urls($list, $create_new, $skip_existing) {
        $this->total = count($list);
        echo "<br />" . $this->total . " Elements to add<br />\n<hr />\n";
        if ($skip_existing) {
            // load the links that are already in the index
            $this->get_all();
        }
        $this->added_pages = 0;
        $this->index_set = null;
        $opened_at = -1; // value of added_pages when the index was last (re)opened
        foreach ($list as $url) {
            // A new segment starts whenever added_pages is a multiple of the
            // segment size. add2index() has just committed at that point, so
            // the index can be reopened safely. Reopen only once per segment
            // instead of on every skipped URL.
            if ((($this->added_pages % $this->segment_size) == 0) && ($opened_at !== $this->added_pages)) {
                $this->index_set = new Zend_Search_Lucene($this->path, $create_new);
                $create_new = false; // later segments update the index just created
                $opened_at = $this->added_pages;
            }
            if ($skip_existing && $this->exists($url)) {
                // already existing
                echo "<br />" . $url . "<br />\n is already in the index!<br />\n";
                echo "<hr />\n";
                continue;
            }
            // The page is downloaded once here only to check reachability;
            // an unreachable or empty page is reported and skipped.
            $body = @file_get_contents($url);
            if ($body === false || $body === '') {
                // page doesn't exist
                echo "<br />" . $url . "<br />\n doesn't exist!<br />\n";
                echo "<hr />\n";
                continue;
            }
            $this->add2index($url);
            if ($skip_existing) {
                // remember it so a duplicate later in the same list is skipped
                $this->existing[] = strtolower($url);
            }
            usleep(500000); // wait 0.5 sec.
            echo "<br />\n<hr />\n";
        }
        if ($this->index_set === null) {
            // empty list: no index was opened, so there is nothing to commit
            echo "<b>Finished.</b><br />\n";
            return true;
        }
        // in the end, write changes to the index
        $this->index_set->commit();
        echo "<b>Saving index.</b><br />\n<b>Finished.</b><br />\n";
        return true;
    }

    /**
     * Checks if an address already exists in the list loaded by get_all().
     *
     * The comparison is case-insensitive and treats the "www." and the
     * non-"www." form of an http:// or https:// address as the same page.
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @param string $url Address to look up
     * @return boolean True if address exists, false if it doesn't
     */
    public function exists($url) {
        $url = strtolower($url);
        // exactly the same URL is already known
        if (in_array($url, $this->existing, true)) {
            return true;
        }
        if (preg_match('/^(https?:\/\/)www\./', $url) == 1) {
            // URL contains "www": look for the same URL without it
            $variant = preg_replace('/^(https?:\/\/)www\./', '${1}', $url);
        } else {
            // URL contains no "www": look for the same URL with it
            $variant = preg_replace('/^(https?:\/\/)/', '${1}www.', $url);
        }
        return in_array($variant, $this->existing, true);
    }

    /**
     * Adds a specific address it gets from the create() or append() method
     * to the index, and if necessary, saves the current segment to the index.
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @param string $url the address of the page to add
     * @return boolean True
     */
    public function add2index($url) {
        $doc = new Zend_Search_Lucene_Document();
        $cur_page = new page_add($url);
        $item = array(
            'headline' => $this->simplify($cur_page->get_title()),
            'contents' => $this->simplify($cur_page->get_content()),
            'link'     => $this->simplify($cur_page->get_link()),
        );
        foreach ($item as $key => $value) {
            // index each component as its own field
            $doc->addField(Zend_Search_Lucene_Field::Text($key, $value));
        }
        echo $cur_page->show();
        $this->index_set->addDocument($doc);
        $this->added_pages++;
        echo $this->added_pages . " / " . $this->total . "<br />\n";
        if (($this->added_pages % $this->segment_size) == 0) {
            // segment is full: save the index
            $this->index_set->commit();
            echo "<b>Saving index.</b><br />\n";
        }
        return true;
    }

    /**
     * Set query term (surrounding whitespace is removed).
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @param string $term query term
     * @return boolean True
     */
    public function set_query($term) {
        $this->query = trim($term);
        return true;
    }

    /**
     * Returns query term
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @return string query
     */
    public function get_query() {
        return $this->query;
    }

    /**
     * Lower all letters, including the German umlauts.
     * Used for getting a query consisting only of lowercase letters.
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @param string $string string to lower
     * @return string lowered string
     */
    public function lower_all($string) {
        $string = strtolower($string);
        // strtolower() is not relied on for the umlauts; map them explicitly
        $upper = array('Ä', 'Ö', 'Ü');
        $lower = array('ä', 'ö', 'ü');
        return str_replace($upper, $lower, $string);
    }

    /**
     * Arranges a term for the query in the index: the term is lower-cased,
     * converted to index encoding and looked for in all three fields.
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @param string $query query term
     * @return boolean true
     */
    public function set_query4index($query) {
        // The field prefixes contain no character that lower_all()/simplify()
        // would change, so only the term itself needs converting.
        $term = $this->simplify($this->lower_all($query));
        $this->query4index = 'headline:' . $term . ' contents:' . $term . ' link:' . $term;
        return true;
    }

    /**
     * Returns the query string which is intended for the search in the index
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @return string $query4index
     */
    public function get_query4index() {
        return $this->query4index;
    }

    /**
     * Searches for a query in the index, saves at most MAX_RESULTS results to
     * $this->results and prints them via show_results().
     *
     * Invalid queries and Lucene errors are not propagated: an HTML error
     * message is echoed instead.
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @param string $query
     * @return boolean true on success, false if an error message was printed
     */
    public function search($query) {
        // term for screen output and highlighting
        $this->set_query($query);
        // term intended for the index
        $this->set_query4index($this->get_query());
        // forget the hits of a previous search on the same object
        $this->results = array();
        $this->number_results = 0;
        try {
            $term = $this->get_query();
            $length = function_exists('mb_strlen') ? mb_strlen($term, 'UTF-8') : strlen($term);
            if ((strpos($term, '/') !== false) || (strpos($term, '?') !== false)) {
                throw new Zend_Search_Lucene_Exception('No special characters please.', 1);
            } elseif ($length < 3) {
                throw new Zend_Search_Lucene_Exception('length', 2);
            }
            $index = new Zend_Search_Lucene($this->path);
            $hits = $index->find($this->get_query4index());
            $this->index_size = $index->count();
            // total number of hits, even if only MAX_RESULTS of them are shown
            $this->number_results = count($hits);
            foreach ($hits as $hit) {
                if (count($this->results) >= MAX_RESULTS) {
                    break;
                }
                $document = $hit->getDocument();
                $title = $this->unsimplify($document->headline);
                $link = $this->unsimplify($document->link);
                $content = $this->unsimplify($document->contents);
                $this->results[] = new page_found($link, $title, $content, $hit->score);
            }
            $this->show_results();
            return true;
        } catch (Zend_Search_Lucene_Exception $e) {
            if ($e->getMessage() == "length") {
                echo "<p>Your term must have at least 3 characters!</p>\n";
            } else {
                echo "<p>Please enter a valid term!</p>\n";
                // the message may quote parts of the query, so escape it
                echo htmlspecialchars($e->getMessage(), ENT_QUOTES, 'UTF-8');
            }
            echo '<a href="javascript:history.back()">go back</a>';
            return false;
        }
    }

    /**
     * Returns the number of results. Once there are more than MAX_RESULTS, it
     * returns the string "more than MAX_RESULTS".
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @return Int/String Number
     */
    public function getNumberOfResults() {
        if ($this->number_results > MAX_RESULTS) {
            return "more than " . MAX_RESULTS;
        }
        return $this->number_results;
    }

    /**
     * Prints number of elements in the index, number of results, results
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @return boolean true
     */
    public function show_results() {
        echo "Index contains " . $this->index_size . " documents.<br />\n\n";
        $word = ($this->number_results == 1) ? "hit" : "hits";
        // the query comes straight from the user: escape it before echoing
        $shown_query = htmlspecialchars((string) $this->query, ENT_QUOTES, 'UTF-8');
        echo "Search for '" . $shown_query . "' returned " . $this->getNumberOfResults() . " " . $word . "<br />\n\n";
        foreach ($this->results as $result) {
            $result->highlight_all($this->query);
            echo $result->show();
        }
        return true;
    }

    /**
     * Gets the links of all elements in the index whose link field matches
     * "http" and saves them, lower-cased, to $this->existing (replacing any
     * previously loaded list).
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @return boolean true
     */
    public function get_all() {
        $this->existing = array();
        $query = $this->simplify(strtolower('http'));
        $index = new Zend_Search_Lucene($this->path);
        $hits = $index->find('link:' . $query);
        foreach ($hits as $hit) {
            $document = $hit->getDocument();
            $link = $this->unsimplify($document->link);
            $this->existing[] = strtolower($link);
        }
        return true;
    }

    /*
    NOTE(review): disabled code, kept for reference. If it is ever re-enabled,
    "$path" below is an undefined local and presumably should be "$this->path".

    function flush() {
    // empty the index IRREPARABLY
    $files = scandir($this->path);
    foreach ($files as $file) {
    $item = $path . '/' . $file;
    if ($file!= '.' && $file != '..') {
    if(unlink($item)) {
    echo "Removing " . $item . "<br />\n";
    } else {
    echo "Could not remove " . $item . "<br />\n";
    }
    }
    }
    }*/

    /**
     * Arranges a string for a query in the index, i.e. encodes the characters
     * listed in $char_old as 'xxx<replacement>xxx' and converts the rest to
     * the ASCII charset.
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @param string $string to arrange
     * @return string $string arranged string
     */
    public function simplify($string) {
        foreach ($this->char_old as $key => $value) {
            $string = str_replace($value, 'xxx' . $this->char_new[$key] . 'xxx', $string);
        }
        $ascii = function_exists('iconv') ? @iconv('UTF-8', 'ASCII//TRANSLIT', $string) : false;
        if ($ascii === false) {
            // iconv is unavailable or the input is not valid UTF-8: fall back
            // to masking every non-ASCII byte instead of losing the string
            $ascii = preg_replace('/[^\x00-\x7F]/', '?', $string);
        }
        return $ascii;
    }

    /**
     * Arranges a string human readable and browser friendly: turns every
     * 'xxx<replacement>xxx' marker written by simplify() back into the
     * original character.
     *
     * @author Natalie Kather <natalie.kather@northclick.de>
     * @param string $string to arrange
     * @return string $string arranged string
     */
    public function unsimplify($string) {
        // No charset conversion is needed here: text written by simplify() is
        // ASCII, and ASCII bytes are identical in UTF-8.
        foreach ($this->char_new as $key => $value) {
            $string = str_replace('xxx' . $value . 'xxx', $this->char_old[$key], $string);
        }
        return $string;
    }
}
?>