Recently I needed a crawler script that would create a list of all pages on a single domain. As a part of that I wrote some functions that could download a page, extract all URLs from the HTML and turn them into absolute URLs (so that they themselves can be crawled later). Here’s the PHP code.
Extracting All Links From A Page
Here’s a function that downloads the specified URL and extracts all links from the HTML. It also translates relative URLs to absolute URLs, removes repeated links and is overall a fine piece of code 🙂 Depending on your goal, you may want to comment out some lines (e.g. the part that strips ‘#something’ in-page anchors from URLs).

/**
 * Download a page and return every distinct link on it that points into
 * the given domain.
 *
 * Relative links are converted to absolute URLs with relative2absolute(),
 * which is defined further down on this page.
 *
 * @param string $page_url Page to extract links from.
 * @param string $domain   Only links on this domain (and its subdomains)
 *                         are returned, e.g. "example.com".
 *
 * @return array|false List of absolute URLs, or false if the page could not
 *                     be downloaded or parsed.
 */
function crawl_page($page_url, $domain) {
    /* I'm using cURL to retrieve the page */
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }
    curl_setopt($ch, CURLOPT_URL, $page_url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

    /* Spoof the User-Agent header value; just to be safe */
    curl_setopt($ch, CURLOPT_USERAGENT,
        'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)');

    /* Timeouts for the connection and the download, so the script does not
       get stuck on huge files or a nonresponsive server. These are optional. */
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 15);

    /* This ensures 404 Not Found (and similar) will be treated as errors */
    curl_setopt($ch, CURLOPT_FAILONERROR, true);

    /* This might/should help against accidentally downloading mp3 files
       and such, but servers are free to ignore it. */
    $header = array("Accept: text/html, text/*");
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);

    /* Download the page */
    $html = curl_exec($ch);
    curl_close($ch);

    if (!is_string($html) || $html === '') {
        return false;
    }

    /* Extract the BASE tag (if present) for relative-to-absolute URL
       conversions later; otherwise links are relative to the page itself. */
    if (preg_match('/<base\b[^>]*?\bhref\s*=\s*[\"\']?([^\'\" >]+)[\'\" >]/i', $html, $matches)) {
        $base_url = $matches[1];
    } else {
        $base_url = $page_url;
    }

    /* Tags may be split across lines; flatten them so the regexp below can
       match a whole <a ...> tag. */
    $html = str_replace(array("\r", "\n"), ' ', $html);

    /* Capture the (quoted) href value of every <a> tag in group 2. */
    $found = preg_match_all('/<a\s+[^>]*?href\s*=\s*([\"\']+)([^>]+?)(\1|>)/i', $html, $m);
    if ($found === false) {
        return false;
    }

    /* Built once: scheme, optional userinfo, optional subdomain labels, the
       domain itself, optional port, and then the end of the authority. The
       anchoring stops "evilexample.com" or "example.com.evil.org" from
       passing as "example.com". */
    $domain_pattern = '/^https?:\/\/(?:[^\/?#@]*@)?(?:[^\/?#:@]+\.)?'
        . preg_quote($domain, '/')
        . '(?::\d+)?(?:[\/?#]|$)/i';

    /* URL => true; array keys give cheap de-duplication. */
    $seen = array();

    foreach ($m[2] as $url) {
        $url = trim($url);

        /* Decode &amp; first so a trailing "&amp;PHPSESSID=..." is also
           recognised by the session-id pattern below. */
        $url = str_replace('&amp;', '&', $url);

        /* get rid of PHPSESSID, #linkname and javascript: */
        $url = preg_replace(
            array(
                '/([\?&]PHPSESSID=\w+)$/i',
                '/(#[^\/]*)$/i',
                '/^(javascript:.*)/i',
            ),
            '',
            $url
        );

        /* Nothing left (pure in-page anchor or javascript: link). */
        if (!is_string($url) || $url === '') {
            continue;
        }

        /* turn relative URLs into absolute URLs */
        $url = relative2absolute($base_url, $url);
        if (!is_string($url) || $url === '') {
            continue; /* seriously malformed URL */
        }

        /* keep the URL only if it is in the same (sub-)$domain */
        if (preg_match($domain_pattern, $url)) {
            $seen[$url] = true;
        }
    }

    /* Keys are always URL strings starting with "http", so PHP never
       converts them to integers. */
    return array_keys($seen);
}
How To Translate a Relative URL to an Absolute URL
This function is based on one I found on the web, with some small but significant changes.

/**
 * Resolve a (possibly relative) URL against an absolute base URL.
 *
 * Follows the reference-resolution rules of RFC 3986, section 5.2:
 *  - a reference that already has a scheme is returned unchanged;
 *  - "//host/path" inherits only the scheme of the base;
 *  - "/path" replaces the whole path of the base;
 *  - "path" is appended to the directory of the base path;
 *  - "", "?query" and "#fragment" keep the full base path;
 *  - "." and ".." segments are then collapsed.
 * User, password, host and port of the base are preserved.
 *
 * @param string $absolute Base URL, normally the URL of the page the link
 *                         was found on (or its BASE href).
 * @param string $relative The link to resolve.
 *
 * @return string|false The absolute URL, or false if either argument is too
 *                      malformed for parse_url() to handle.
 */
function relative2absolute($absolute, $relative) {
    $relative = (string) $relative;

    $p = parse_url($relative);
    if ($p === false) {
        /* $relative is a seriously malformed URL */
        return false;
    }
    if (isset($p['scheme'])) {
        /* Already absolute (http:, mailto:, ...). */
        return $relative;
    }

    $parts = parse_url($absolute);
    if ($parts === false) {
        return false;
    }

    /* Protocol-relative reference: only the scheme comes from the base. */
    if (substr($relative, 0, 2) === '//') {
        return isset($parts['scheme']) ? $parts['scheme'] . ':' . $relative : $relative;
    }

    /* Split the reference by hand into path, query and fragment, so that
       slashes or dots inside the query/fragment are never treated as path
       segments. null means "component not present". */
    $path = $relative;
    $query = null;
    $fragment = null;
    $pos = strpos($path, '#');
    if ($pos !== false) {
        $fragment = (string) substr($path, $pos + 1);
        $path = (string) substr($path, 0, $pos);
    }
    $pos = strpos($path, '?');
    if ($pos !== false) {
        $query = (string) substr($path, $pos + 1);
        $path = (string) substr($path, 0, $pos);
    }

    $base_path = isset($parts['path']) ? $parts['path'] : '';

    if ($path === '') {
        /* Same document: keep the base path, and the base query too unless
           the reference brings its own. */
        $merged = $base_path;
        if ($query === null && isset($parts['query'])) {
            $query = $parts['query'];
        }
    } else if (substr($path, 0, 1) === '/') {
        $merged = $path;
    } else {
        /* Replace everything after the last "/" of the base path. */
        $slash = strrpos($base_path, '/');
        $merged = ($slash === false ? '' : substr($base_path, 0, $slash + 1)) . $path;
    }

    $rooted = (substr($merged, 0, 1) === '/');
    if ($rooted) {
        $merged = (string) substr($merged, 1);
    }

    /* Collapse "." and ".." with a stack, so consecutive ".." segments each
       remove one directory and never climb above the root. */
    $segments = explode('/', $merged);
    $last = count($segments) - 1;
    $resolved = array();
    foreach ($segments as $i => $segment) {
        if ($segment === '.' || $segment === '..') {
            if ($segment === '..') {
                array_pop($resolved);
            }
            if ($i === $last) {
                /* A trailing dot segment names a directory: keep the "/". */
                $resolved[] = '';
            }
            continue;
        }
        $resolved[] = $segment;
    }

    /* Reassemble: scheme://user:pass@host:port/path?query#fragment */
    $url = '';
    if (isset($parts['scheme'])) {
        $url = $parts['scheme'] . '://';
    }
    if (isset($parts['user'])) {
        $url .= $parts['user'];
        if (isset($parts['pass'])) {
            $url .= ':' . $parts['pass'];
        }
        $url .= '@';
    }
    if (isset($parts['host'])) {
        $url .= $parts['host'];
        if (isset($parts['port'])) {
            $url .= ':' . $parts['port'];
        }
    }
    if (isset($parts['host']) || $rooted) {
        $url .= '/';
    }
    $url .= implode('/', $resolved);

    if ($query !== null) {
        $url .= '?' . $query;
    }
    if ($fragment !== null) {
        $url .= '#' . $fragment;
    }

    return $url;
}