<?PHP

/*
* Source viewer: index.php?source=N sends this script's own code instead
* of the page and stops.
*   1 = syntax highlighted, 2 = escaped plain listing, 3 = raw file.
* The parameter is optional, so read it without touching a missing key.
*/
$sourcemode = isset($_GET["source"]) ? $_GET["source"] : null;

switch ($sourcemode)
{
case 1:
    /*
    * highlight_file() prints the highlighted code itself and returns
    * true; wrapping it in print() appended a stray "1" to the output.
    */
    highlight_file("./index.php");
    exit;
case 2:
    print(nl2br(htmlspecialchars(join('',file("./index.php")))));
    exit;
case 3:
    /*
    * Serve the raw file as plain text; as text/html the browser would
    * treat the PHP blocks as markup and hide them.
    */
    if (!headers_sent())
    {
        header("Content-Type: text/plain; charset=iso-8859-1");
    }
    print(join('',file("./index.php")));
    exit;
}

?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<?php print '<?xml version="1.0" encoding="iso-8859-1" ?>'; ?>
<html><head>

<title>Open Directory - Tools - Clean HTML</title>
<meta http-equiv="cache-control" content="no-cache" />
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
<meta http-equiv="pragma" content="no-cache" />
<meta http-equiv="expires" content="0" />
<link rel="stylesheet" type="text/css" href="http://dmoz.org/dmoz.css" />
</head>
<body bgcolor="#ffffff" text="#000000" link="#3300cc" vlink="#660066"
alink="#FF0000">
<table border="0" bgcolor="#669933" cellpadding="3" cellspacing="0" width="100%">
<tr>
<td align="left" valign="middle"> &nbsp; <a href="http://dmoz.org/"><img src="http://dmoz.org/img/odphead.gif" width="222" height="25" align="middle" border="0" alt="Open Directory Project" /></a></td></tr><tr><td bgcolor="#cccccc"
align="right"><a href="http://dmoz.org/">home</a> |
<a href="/">Author page</a> |
<a href="http://dmoz.org/editors/">editor login</a></td></tr>
</table>

<br />
<div align="center">
<table cellpadding="0" cellspacing="0" border="0"> <tr> <td><a href="http://www.tehospedo.com.br/hospedagem-de-sites/"><img src="http://www.tehospedo.com.br/img/promocoes/selo3.gif" alt="Hospedagem de sites" width="102" height="30" border="0" /></a><br /><a href="http://www.tehospedo.com.br/hospedagem-de-sites/" title="Hospedagem de sites" style="color: #ADADAD;text-decoration: none;font-size:10px;font-family: Arial, Helvetica, sans-serif;">Hospedagem de Sites</a></td></tr></table>
</div>

<h1>Clean HTML - Dmoz</h1>

<p>The purpose of this tool is simply to get a list of URLs and titles from an HTML fragment or a URL, because the DMOZ URL cleaning engine (used on "Add a page of links to unreviewed") sometimes cannot recognize the URLs inside a web page and, instead of returning all URLs, returns only one or two. Test both tools (the official Dmoz one and this one) with the URL http://grandeminas.globo.com/unainet/index_jornais.htm. There are some <a href="#todo">improvements</a> still to be added to Clean HTML, but it is working.</p>

<h2>Usage</h2>
<p>Enter the HTML fragment or the URL. If you choose the output type "Urls and Titles" you will get an HTML fragment that can be parsed by the official Dmoz multilinks tool.</p>


<?PHP


/***************************************************************/
/* @license This script is LGPL. You can get a
/* copy of this license at
/* http://www.gnu.org/
/*
/* @author Roberto Berto <darkelder@php.net>
/*
/* @description Parse HTML fragment and return only url
/* and titles.
/***************************************************************/

/*
* defining things
*/
define("ERROR_NO_HREF",-1);
define("ERROR_NO_TYPE",-2);
define("ERROR_URL_ERROR",-3);
define("ERROR_NONE",1);

define("TYPE_URL_TITLE",1);
define("TYPE_URL",2);

/*
* Read the submitted fields defensively: on a plain GET request none of
* them exist, and a field sent as an array must not reach the string
* functions below. A missing type falls back to the format the form
* labels as its default.
*/
$text = (isset($_POST["text"]) && is_string($_POST["text"])) ? $_POST["text"] : '';
$posturl = (isset($_POST["url"]) && is_string($_POST["url"])) ? trim($_POST["url"]) : '';
$posttype = isset($_POST["type"]) ? $_POST["type"] : TYPE_URL_TITLE;

/*
* 0 means "nothing processed yet"; it is neither ERROR_NONE nor one of
* the negative error codes, so neither output branch below fires.
*/
$errorcode = 0;
$cleaned = '';
$totalurl = 0;

$preg = <<<EOF
/<a[^>]*href=["']?([^"'> ]*)[^>]*>(.*?)<\/a>/i
EOF;

define("URL_PREG",$preg);

/*
* set_magic_quotes_runtime() no longer exists in PHP 7+, where calling it
* is a fatal error. Switching the ini directive off has the same effect
* on old interpreters and is a silent no-op on new ones.
*/
@ini_set("magic_quotes_runtime", "0");

if ($posturl !== '')
{
    /*
    * Only fetch real web addresses. Passing the raw field to file()
    * would also open local paths and stream wrappers, letting a
    * visitor read files from the server.
    * NOTE(review): http(s) requests to internal hosts are still
    * possible; restrict the host as well if that matters here.
    */
    $urlextern = preg_match('#^https?://#i', $posturl) ? @file($posturl) : FALSE;
    if ($urlextern == FALSE)
    {
        $errorcode = ERROR_URL_ERROR;
    }
    else
    {
        $text = join('',$urlextern);
    }
}

if ($text !== '' && $errorcode === 0)
{
    $errorcode = clearhtml($text, $posttype, $cleaned, $totalurl);
}

if ($errorcode == ERROR_NONE)
{
    /*
    * The page is served as iso-8859-1. Without an explicit charset newer
    * PHP versions assume UTF-8 and return an empty string for any
    * non-UTF-8 byte sequence.
    */
    printf("<h2>Parsed html</h2><p>%d urls found.</p><form><textarea cols=\"60\" rows=\"20\">%s</textarea></form>",$totalurl,htmlentities($cleaned, ENT_QUOTES, 'ISO-8859-1'));
}

if ($errorcode < 0)
{
    print "<font color=red><b>Error</b>: ";
    switch ($errorcode)
    {
    case ERROR_NO_HREF:
        print "Cannot get any url.";
        break;
    case ERROR_NO_TYPE:
        print "Type parm invalid.";
        break;
    case ERROR_URL_ERROR:
        print "Cannot open given URL.";
        break;
    }
    print "</font>";
    /* an empty $text makes the input form show up again */
    $text = '';
}

if (empty($text))
{
    /*
    * The values echoed back into the form come straight from the request,
    * so they are escaped for HTML first; printed raw, a crafted value could
    * close the attribute or the textarea and inject markup. Missing or
    * non-string fields become empty strings.
    */
    $formurl = (isset($_POST["url"]) && is_string($_POST["url"])) ? htmlspecialchars($_POST["url"], ENT_QUOTES, 'ISO-8859-1') : '';
    $formtext = (isset($_POST["text"]) && is_string($_POST["text"])) ? htmlspecialchars($_POST["text"], ENT_QUOTES, 'ISO-8859-1') : '';
    /* keep the previously chosen output type selected; type 1 is the default */
    $onlyurls = (isset($_POST["type"]) && $_POST["type"] == TYPE_URL);
?>
<h2>Fill the form bellow to get parsed urls</h2>
<form action="index.php" method="post">
<b>URL:</b><br />
<small>Put a URL to get links here.</small><br />
<input type="text" size="40" name="url" value="<?PHP print $formurl; ?>" />
<br />
<b>HTML Fragment:</b><br />
<small>If you do not fillow URL field, put some HTML fragment with some &lt;a href...&gt; ... &lt;/a&gt; blocks.</small><br />

<textarea name="text" rows="20" cols="60"><?PHP print $formtext; ?></textarea>
<br />
<b>Type of output:</b><br />
<input type="radio" name="type" value="1" <?PHP if (!$onlyurls) print 'checked="checked"'; ?> /> Urls and Titles (eg: &lt;a href=http://mysite.com&gt;My Site Title&lt;/a&gt;) <b>Default</b>.<br />
<input type="radio" name="type" value="2" <?PHP if ($onlyurls) print 'checked="checked"'; ?> /> Only Urls (eg: http://mysite.com)<br />
<br />
<input type="submit" value="Get Clean HTML" />
</form>
<?PHP
}

?>

<table width="100%" border="0">
<tr valign="top">
<td>
<h2>Author and Sourcecode</h2>
<p>This tool was built with PHP under the LGPL license. You can read the source code of clearhtml <a href="index.php?source=1">with highlights</a> and <a href="index.php?source=2">without highlights</a>. You can also <a href="index.php?source=3">download it here</a> (click on save as).</p>
<p>The author is Roberto Berto (<a href="http://dmoz.org/profiles/darkelder.html">darkelder</a> at dmoz) or at <a href="/">his homepage</a>.</p>

<h2><a name="todo">To do list</a></h2>
<ul>
<li>Work with relative URLS (&lt;a href="home.html"&gt; ...).</li>
<li>Add a meta and URL title improvement.</li>
<li>Auto submit to Dmoz multilink.cgi the results (list like multilink confirm page: description, url, mark sites to visit, ODP note).</li>
<li>Add a redirect and broken links checker, like Dmoz multilink.cgi</li>
<li>Only external links (all sites have internal links that aren't relevant) -- optional</li>
</ul>

<h2>Changelog</h2>
<ul>
<li>2002-11-04 - Added url count and code cleaning.</li>
<li>2002-11-04 - If title or url is null, do not print it.</li>
<li>2002-11-04 - Changed preg_match to preg_match_all. One bug killed.</li>
<li>2002-11-04 - First release 1.0</li>
</ul>

<hr />
<p>
<a href="http://validator.w3.org/check/referer"><img
src="http://www.w3.org/Icons/valid-xhtml10"
alt="Valid XHTML 1.0!" height="31" width="88" /></a>
</p>
</td>

<td width="340">
<a href="http://www.tehospedo.com.br/hospedagem-de-sites/" title="Hospedagem de sites"><img src="http://www.tehospedo.com.br/img/banners/promocao_parceiros/banner_336x280.jpg" alt="Hospedagem de sites" width="336" height="280" border="0"/></a>
</td>
</tr>
</table>

<hr />

<p>
This site is kindly hosted by TeHospedo, check it <a href="http://www.tehospedo.com.br">hospedagem de sites</a> Linux e Windows.
</p>



<br /><br />

</body>
</html>

<?PHP

/**
* Extract the links of an HTML text and append them, formatted, to $cleaned.
*
* Every match of URL_PREG yields a url (first capture group) and a title
* (second capture group). Runs of whitespace in the title are collapsed to
* one space, tags inside the title are removed and a leading "http://."
* in the url is rewritten to "http://". Links whose title or url ends up
* as an empty string are skipped.
*
* NOTE(review): the optional $type in front of required parameters is
* deprecated since PHP 8.0; the signature is left untouched for callers.
*
* @author Roberto Berto <darkelder@php.net>
* @version 1.0
* @date 20021104
* @param string $text Long text with html source
* @param int $type Output format, TYPE_URL_TITLE or TYPE_URL
* @param string $cleaned By reference; the formatted links are appended to it
* @param int $totalurl By reference; incremented once per link that has both a title and a url
* @return int ERROR_NONE if $cleaned is not empty afterwards, otherwise
*             ERROR_NO_TYPE (links found but $type unknown) or
*             ERROR_NO_HREF (no link found)
*/
function clearhtml($text,$type = TYPE_URL_TITLE, &$cleaned, &$totalurl)
{
    /*
    * default is return ERROR_NO_HREF
    */
    $errorcode = ERROR_NO_HREF;

    /*
    * The caller may hand in variables that do not exist yet (null).
    * Normalise them so the results always are a string and an integer,
    * while still appending to anything that was passed in.
    */
    $cleaned = (string) $cleaned;
    $totalurl = (int) $totalurl;

    if (preg_match_all(URL_PREG,$text,$re))
    {
        /*
        * Strictly less than the number of matches: the former "<="
        * bound ran one extra pass over a non-existent element.
        */
        $found = sizeof($re[1]);
        for ($x = 0; $x < $found; $x++)
        {
            $url = $re[1][$x];
            $title = $re[2][$x];
            /*
            * cleaning repeated whitespace, http://. (to http://) and
            * other tags; {2,} also covers odd-length runs, which the
            * pairwise pattern left as two spaces
            */
            $title = preg_replace("/\s{2,}/"," ",$title);
            $url = preg_replace("/^http:\/\/\./","http://",$url);
            $title = preg_replace("/<.*?>/","",$title);

            /*
            * compare against the empty string: empty() would also
            * throw away a link whose title is just "0"
            */
            if ($title !== '' && $url !== '')
            {
                $totalurl++;
                /*
                * select the output format and append it
                */
                switch ($type)
                {
                case TYPE_URL:
                    $cleaned .= sprintf("%s\n",$url);
                    break;
                case TYPE_URL_TITLE:
                    $cleaned .= sprintf('<a href="%s">%s</a>' . "\n\n",$url,$title);
                    break;
                default:
                    $errorcode = ERROR_NO_TYPE;
                }
            }
        }
    }

    /*
    * returning error code
    */
    if ($cleaned !== '')
    {
        return ERROR_NONE;
    }

    return $errorcode;
}
?>