<?php require('dbclass.php');
/**
 * Fetches a list of web pages, extracts candidate keywords from their text
 * and stores them in the `urls` / `keywords` tables through DB_Class.
 *
 * Typical use:
 *   $harvester = new Harvest_Keywords($newlineSeparatedUrls);
 *   $harvester->process();
 *
 * Progress problems (invalid or unreachable URLs) are reported by echoing an
 * HTML line, as the original implementation did.
 */
class Harvest_Keywords
{
    /** Database wrapper; only its getone() and query() methods are used here. */
    public $_db;

    /** List of trimmed, non-empty URLs to process. */
    public $_urlarray = array();

    /** Words that are never stored as keywords. */
    public $_stopwords = array('and', 'but', 'are', 'the');

    /** Words that are always stored, even if too short or containing symbols. */
    public $_allowwords = array('c++', 'ado', 'vb');

    /** Maximum number of bytes read from each page. */
    public $_maxbytes = 25000;

    /**
     * @param string|array $urls Newline-separated URLs (e.g. textarea content)
     *                           or an array of URLs.
     * @param object|null  $db   Optional database wrapper exposing getone() and
     *                           query(); when omitted a DB_Class is created with
     *                           the original default credentials.
     */
    public function __construct($urls, $db = null)
    {
        // NOTE(review): credentials are hard-coded as in the original; inject
        // $db from configuration where possible.
        $this->_db = ($db !== null) ? $db : new DB_Class('test', 'username', 'password');

        $lines = is_array($urls) ? $urls : preg_split('/\r\n|\r|\n/', (string) $urls);

        // Trim every line so Windows line endings and padding do not end up
        // inside the URL, and drop blank lines instead of reporting them.
        $this->_urlarray = array();
        foreach ($lines as $line) {
            $line = trim((string) $line);
            if ($line !== '') {
                $this->_urlarray[] = $line;
            }
        }
    }

    /**
     * Normalises one candidate word in place.
     *
     * On return $item holds the lower-cased keyword, or an empty string when
     * the word is rejected. Usable as an array_walk() callback.
     *
     * @param string $item  Word to normalise (modified in place).
     * @param mixed  $key   Unused; present for array_walk() compatibility.
     * @param mixed  $array Unused; kept so existing three-argument calls still work.
     */
    public function _prune(&$item, $key, $array = null)
    {
        $word = strtolower((string) $item);

        // Allow-listed terms are kept verbatim; stripping symbols from them
        // would turn "c++" into "c".
        if (in_array($word, $this->_allowwords, true)) {
            $item = $word;
            return;
        }

        // Anything with characters outside letters, digits, apostrophe, hyphen
        // and sentence punctuation is not treated as a word.
        if (preg_match("/[^a-z0-9'?!-]/", $word)) {
            $item = '';
            return;
        }

        // Strip sentence punctuation before the length and stop-word checks so
        // that "the!" is recognised as the stop word "the".
        $word = str_replace(array('?', '!'), '', $word);

        if (strlen($word) < 3 || in_array($word, $this->_stopwords, true)) {
            $item = '';
            return;
        }

        $item = $word;
    }

    /**
     * Checks that $url is an http(s) URL containing at least one dot.
     *
     * The pattern is anchored so a string that merely contains "http://"
     * somewhere (for instance a local path) is not accepted and later opened.
     *
     * @param string $url
     * @return int|false preg_match() result: 1 when valid, 0 when not.
     */
    public function _checkURL($url)
    {
        return preg_match('#^https?://\S+\.\S+\z#i', (string) $url);
    }

    /**
     * Downloads up to $_maxbytes bytes of $url and returns its text content
     * with tags removed and "&nbsp;" replaced by a space.
     *
     * @param string $url
     * @return string|false Page text, or FALSE when the URL could not be read
     *                      (a message is echoed in that case).
     */
    public function _getData($url)
    {
        // '@' is deliberate: a failed open is reported below instead of
        // through a PHP warning.
        $filehandle = @fopen($url, 'r');
        if (!$filehandle) {
            $this->_report('Could not open URL', $url);
            return false;
        }

        // fread() on a network stream returns after a single chunk;
        // stream_get_contents() keeps reading up to the requested length.
        $data = stream_get_contents($filehandle, $this->_maxbytes);
        fclose($filehandle);

        if ($data === false) {
            $this->_report('Could not open URL', $url);
            return false;
        }

        return str_replace('&nbsp;', ' ', strip_tags($data));
    }

    /**
     * Validates, downloads and indexes a single URL.
     *
     * @param string $url
     */
    public function _harvest($url)
    {
        if (!$this->_checkURL($url)) {
            $this->_report('URL is not valid', $url);
            return;
        }

        $data = $this->_getData($url);
        if (!$data) {
            // Unreadable (already reported) or empty page: nothing to index.
            return;
        }

        $this->_storeKeywords($url, $this->_extractKeywords($data));
    }

    /**
     * Harvests every URL given to the constructor.
     */
    public function process()
    {
        foreach ($this->_urlarray as $url) {
            $this->_harvest($url);
        }
    }

    /**
     * Splits page text into a sorted list of accepted keywords.
     * Duplicates are kept, as in the original implementation.
     *
     * @param string $data
     * @return array List of keywords (possibly empty).
     */
    private function _extractKeywords($data)
    {
        $candidates = preg_split('/[\s,.]+/', $data, -1, PREG_SPLIT_NO_EMPTY);
        if ($candidates === false) {
            return array();
        }

        $keywords = array();
        foreach ($candidates as $index => $word) {
            $this->_prune($word, $index);
            if ($word !== '') {
                $keywords[] = $word;
            }
        }
        sort($keywords);

        return $keywords;
    }

    /**
     * Replaces the stored keywords of $url with $keywords, creating the
     * `urls` row first when the URL is new.
     *
     * Assumes getone() returns a falsy value when no row matches; the
     * original code relied on the same.
     *
     * @param string $url
     * @param array  $keywords
     */
    private function _storeKeywords($url, $keywords)
    {
        $quotedUrl = $this->_quote($url);
        $lookup = "SELECT id FROM urls WHERE url=$quotedUrl";

        $url_id = (int) $this->_db->getone($lookup);
        if ($url_id) {
            $this->_db->query("DELETE FROM keywords WHERE url_id=$url_id");
        } else {
            $this->_db->query("INSERT INTO urls SET url=$quotedUrl");
            // Re-read the id through the wrapper instead of mysql_insert_id(),
            // which no longer exists in current PHP and bypasses DB_Class.
            $url_id = (int) $this->_db->getone($lookup);
            if (!$url_id) {
                $this->_report('Could not store URL', $url);
                return;
            }
        }

        // No keywords survived pruning: avoid inserting an empty keyword row.
        if (count($keywords) === 0) {
            return;
        }

        $rows = array();
        foreach ($keywords as $keyword) {
            $rows[] = "($url_id, " . $this->_quote($keyword) . ')';
        }
        $this->_db->query('INSERT INTO keywords VALUES ' . implode(', ', $rows));
    }

    /**
     * Returns $value as a single-quoted SQL string literal.
     *
     * NOTE(review): addslashes() is the only escaping available without
     * knowing DB_Class; switch to the wrapper's own escaping or parameter
     * binding if it provides one.
     *
     * @param string $value
     * @return string
     */
    private function _quote($value)
    {
        return "'" . addslashes((string) $value) . "'";
    }

    /**
     * Echoes a one-line HTML status message with the URL safely escaped.
     *
     * @param string $message
     * @param string $url
     */
    private function _report($message, $url)
    {
        echo $message . ' (' . htmlspecialchars((string) $url, ENT_QUOTES) . ").<br />\n";
    }
}