Although the function below is adequate for simple purposes, it lacks a more advanced algorithm for crawling websites. I'll circle back and rewrite it when there's a need for better tooling.
# findDownloadUrl_v0.0.2.ps1
# This little program parses a web page for its downloadable contents.
# Multi-threading (via background jobs) has been added in this version.

# Placeholder: replace with the page the crawl should start from.
# (The stray spaces around '://' made the original value an invalid URL.)
$startUrl='https://somewebsite'
# Links ending with this text are treated as the wanted download.
$fileExtension='.exe'
# How many link levels below the start page may be followed.
$maxDepth=3
function findDownloadUrl{
    <#
    .SYNOPSIS
    Crawls a page and the pages it links to, looking for a link that ends with a given extension.

    .DESCRIPTION
    The start page is fetched first. If it already contains a matching link, that link is
    returned. Otherwise each link found on the start page is explored in turn: pages are
    fetched in background jobs (at most $simultaneousJobs at a time) and any links they
    contain are queued, newest discoveries first, until a match is found or $maxDepth is reached.

    .PARAMETER startUrl
    Page to start from. A trailing '/' is appended when missing.

    .PARAMETER fileExtension
    Text the wanted link must end with, e.g. '.exe'.

    .PARAMETER maxDepth
    Deepest link level that is fetched. Links on the start page are level 1.

    .PARAMETER simultaneousJobs
    Maximum number of background jobs running at once (default 8).

    .OUTPUTS
    The matching URL (string) on success, $false when nothing matched,
    $null when the start URL is blank or the start page yields no links.
    #>
    param(
        $startUrl,
        $fileExtension,
        $maxDepth=3,
        $simultaneousJobs=8
    )
    # At least one job slot is needed, otherwise the scheduling loop could never start work.
    if($simultaneousJobs -lt 1){$simultaneousJobs=1}
    $linksChecked=0
    $firstResult=$false
    $timer=[System.Diagnostics.Stopwatch]::StartNew()

    if(!$startUrl){
        write-warning "Cannot start with a blank parent URL"
        # Nothing can be crawled without a URL; stop here instead of requesting '/'.
        return $null
    }elseif($startUrl -notmatch '/$'){
        $startUrl=$startUrl+'/'
    }

    # Fetches one page and returns @{result=<bool>; links=<match or array of links>}.
    # An explicit param() block keeps the function body self-contained, because the body
    # text is shipped to background jobs and rebuilt there with [ScriptBlock]::Create.
    function findFile{
        param($parentUrl,$extension)
        $ProgressPreference='SilentlyContinue'
        $ErrorActionPreference='stop'
        if($parentUrl -notmatch '/$'){$parentUrl=$parentUrl+'/'}
        try{
            $page=Invoke-WebRequest $parentUrl -TimeoutSec 10
        }catch{
            # Unreachable pages are deliberately treated as "no links here".
            return @{'result'=$false;'links'=@()}
        }
        # scheme://host[:port] - the base for hrefs that start with '/'.
        $siteRoot=([System.Uri]$parentUrl).GetLeftPart([System.UriPartial]::Authority)
        # Pattern used to drop the link that points back to the parent location.
        $parentPattern="*$(Split-Path $parentUrl -parent)"
        $newLinks=@(
            $page.links.href|
            ?{$_ -and ($_ -notlike $parentPattern)}|   # skip anchors without an href
            sort -Descending|
            %{
                if($_[0] -eq '/'){
                    # Root-relative href: resolve against the site root, not the page.
                    $siteRoot+$_
                }elseif($_ -match '^http'){
                    $_
                }else{
                    $parentUrl+$_
                }
            }|
            select -Unique
        )
        $matchedExtension=$newLinks|?{$_ -like "*$extension"}|sort -Descending|select -First 1
        if($matchedExtension){
            return @{'result'=$true;'links'=$matchedExtension}
        }
        # Always an array (possibly empty) so callers never mistake a lone link for a match.
        return @{'result'=$false;'links'=$newLinks}
    }

    write-host "Scanning $startUrl for file extension $fileExtension"
    $startResult=findFile $startUrl $fileExtension
    if($startResult['result']){
        # The start page itself links to a matching file.
        return $startResult['links']
    }
    $startLinks=@($startResult['links'])
    if($startLinks.Count -eq 0){
        write-warning "There were problems parsing links"
        return $null
    }

    $knownLinks=@($startUrl)+$startLinks
    $startDepth=1
    foreach($link in $startLinks){
        write-host "Processing link at current depth: $startDepth"
        # Work stack of {Url, Depth}; newly discovered links are pushed to the front.
        $pending=@([pscustomobject]@{Url=$link;Depth=$startDepth})
        # Jobs started by this call only - other jobs in the session are left alone.
        $activeJobs=@()
        # Job Id -> depth of the link that job is fetching.
        $jobDepth=@{}
        do{
            if($activeJobs.Count -lt $simultaneousJobs -and $pending.Count -gt 0){
                $next=$pending[0]
                if($pending.Count -gt 1){
                    $pending=@($pending[1..($pending.Count-1)])
                }else{
                    $pending=@()
                }
                $thisLink=$next.Url
                write-host "Parsing $thisLink"
                $job=Start-Job -ScriptBlock{
                    param($findFile,$thisLink,$fileExtension)
                    return [ScriptBlock]::Create($findFile).Invoke($thisLink,$fileExtension)
                } -ArgumentList ${function:findFile},$thisLink,$fileExtension
                $jobDepth[$job.Id]=$next.Depth
                $activeJobs+=$job
                $linksChecked++
            }else{
                # Block until at least one of our jobs is done, then harvest every finished one.
                $null=Wait-Job -Job $activeJobs -Any
                $finished=@($activeJobs|?{$_.State -notin 'Running','NotStarted'})
                $finishedIds=@($finished|%{$_.Id})
                $activeJobs=@($activeJobs|?{$_.Id -notin $finishedIds})
                foreach($done in $finished){
                    $depth=$jobDepth[$done.Id]
                    $jobDepth.Remove($done.Id)
                    $results=@(Receive-Job -Job $done)
                    Remove-Job -Job $done -Force
                    if($firstResult){continue}
                    foreach($r in $results){
                        if($null -eq $r){continue}
                        if($r['result']){
                            write-host "Bingo!" -ForegroundColor Green
                            $firstResult=$r['links']
                            break
                        }elseif($depth -lt $maxDepth){
                            $childDepth=$depth+1
                            $addLinks=@($r['links']|?{$_ -and ($_ -notin $knownLinks)})
                            if($addLinks.Count -gt 0){
                                write-host "Adding new links to depth $childDepth`:`r`n$(($addLinks|out-string).trim())"
                                $knownLinks+=$addLinks
                                $pending=@($addLinks|%{[pscustomobject]@{Url=$_;Depth=$childDepth}})+$pending
                            }
                        }
                    }
                }
            }
        }until(($pending.Count -eq 0 -and $activeJobs.Count -eq 0) -or $firstResult)

        if($firstResult){
            # -Force stops and removes jobs that are still running.
            if($activeJobs.Count -gt 0){
                Remove-Job -Job $activeJobs -Force
            }
            $totalMinutes=[math]::round($timer.Elapsed.TotalMinutes,2)
            write-host "Minutes elapsed: $totalMinutes"
            return $firstResult
        }
    }

    $totalMinutes=[math]::round($timer.Elapsed.TotalMinutes,2)
    write-host "$linksChecked links have been checked in $totalMinutes minutes without finding file extension $fileExtension" -ForegroundColor Red
    return $false
}
# Run the crawl with the settings defined at the top of the script (arguments are passed positionally).
findDownloadUrl $startUrl $fileExtension $maxDepth
PS C:\Users\concu> findDownloadUrl $startUrl $fileExtension $maxDepth Scanning for file extension .exe Processing link at current depth: 1 Parsing Adding new links to depth 2: https://tomcat.apache.org/tomcat-9.0-doc/deployer-howto.html https://tomcat.apache.org/tomcat-9.0-doc/changelog.html https://tomcat.apache.org/tomcat-9.0-doc/building.html https://tomcat.apache.org/ Parsing Parsing Parsing https://tomcat.apache.org/tomcat-9.0-doc/deployer-howto.html Parsing https://tomcat.apache.org/tomcat-9.0-doc/changelog.html Parsing https://tomcat.apache.org/tomcat-9.0-doc/building.html Parsing https://tomcat.apache.org/ Parsing Parsing Adding new links to depth 3: Bingo! Minutes elapsed: 0.08
# findDownloadUrl_v0.0.1.ps1
# This little algorithm parses a web page for downloadable contents matching a given extension.
# The script is not yet optimized; multi-threading should greatly improve its performance.

# Page the crawl starts from.
# (The stray spaces around '://' made the original value an invalid URL.)
$startUrl='http://apache.mirrors.pair.com/tomcat/tomcat-9/'
# Links ending with this text are treated as the wanted download.
$fileExtension='.exe'
# How many link levels below the start page may be followed.
$maxDepth=2
function findDownloadUrl{
    <#
    .SYNOPSIS
    Crawls a page and the pages below it, one request at a time, looking for a link that
    ends with a given extension.

    .DESCRIPTION
    The start page is fetched first. If it already contains a matching link, that link is
    returned. Otherwise each link found on the start page is explored in turn, following
    newly discovered links first, until a match is found or $maxDepth is reached.
    Absolute (http...) links found on a page are not followed.

    .PARAMETER parentUrl
    Page to start from. A trailing '/' is appended when missing.

    .PARAMETER fileExtension
    Text the wanted link must end with, e.g. '.exe'.

    .PARAMETER maxDepth
    Deepest link level that is fetched. Links on the start page are level 1.

    .OUTPUTS
    The matching URL (string) on success, otherwise $false.
    #>
    param(
        $parentUrl,
        $fileExtension,
        $maxDepth=3
    )
    if(!$parentUrl){
        write-warning "Cannot start with a blank parent URL"
        # Nothing can be crawled without a URL.
        return $false
    }elseif($parentUrl -notmatch '/$'){
        $parentUrl=$parentUrl+'/'
    }

    # Fetches one page and returns a two-element array: (<matched?>, <match or links>).
    function findFile($parentUrl,$extension){
        $ProgressPreference='SilentlyContinue'
        $ErrorActionPreference='stop'
        if($parentUrl -notmatch '/$'){$parentUrl=$parentUrl+'/'}
        try{
            $page=Invoke-WebRequest $parentUrl -TimeoutSec 10
        }catch{
            # Unreachable pages are deliberately treated as "no links here".
            return @($false,@())
        }
        # scheme://host[:port] - the base for hrefs that start with '/'.
        $siteRoot=([System.Uri]$parentUrl).GetLeftPart([System.UriPartial]::Authority)
        # Pattern used to drop the link that points back to the parent location.
        $parentPattern="*$(Split-Path $parentUrl -parent)"
        $newLinks=@(
            $page.links.href|
            ?{$_ -and ($_ -notlike $parentPattern) -and ($_ -notmatch '^http')}|   # skip empty hrefs and absolute links
            sort -Descending|
            %{
                if($_[0] -eq '/'){
                    # Root-relative href: resolve against the site root, not the page.
                    $siteRoot+$_
                }else{
                    $parentUrl+$_
                }
            }|
            select -Unique
        )
        $matchedExtension=$newLinks|?{$_ -like "*$extension"}|sort -Descending|select -First 1
        if($matchedExtension){
            return @($true,$matchedExtension)
        }elseif($newLinks.Count -gt 0){
            return @($false,$newLinks)
        }else{
            return @($false,@())
        }
    }

    $linksChecked=0

    # Check the start page itself so a direct download link on it is found immediately.
    $startResult=findFile $parentUrl $fileExtension
    $linksChecked++
    if($startResult[0]){
        write-host "Bingo!" -ForegroundColor Green
        return $startResult[1]
    }
    $links=@($startResult[1])
    $knownLinks=@($parentUrl)+$links

    foreach($link in $links){
        write-host "Checking $link"
        # Work stack of {Url, Depth}; newly discovered links are pushed to the front.
        $pending=@([pscustomobject]@{Url=$link;Depth=1})
        while($pending.Count -gt 0){
            $next=$pending[0]
            # Slicing with 1..0 would hand back the same element, so a single-item stack is emptied explicitly.
            if($pending.Count -gt 1){
                $pending=@($pending[1..($pending.Count-1)])
            }else{
                $pending=@()
            }
            $thisLink=$next.Url
            write-host "Parsing $thisLink"
            $result=findFile $thisLink $fileExtension
            $linksChecked++
            if($result[0]){
                write-host "Bingo!" -ForegroundColor Green
                return $result[1]
            }
            if($next.Depth -lt $maxDepth){
                $addLinks=@($result[1]|?{$_ -and ($_ -notin $knownLinks)})
                if($addLinks.Count -gt 0){
                    write-host "Adding new links:`r`n$(($addLinks|out-string).trim())"
                    $knownLinks+=$addLinks
                    $childDepth=$next.Depth+1
                    $pending=@($addLinks|%{[pscustomobject]@{Url=$_;Depth=$childDepth}})+$pending
                }
            }
        }
    }
    write-host "$linksChecked links have been checked without any matching file extension $fileExtension" -ForegroundColor Red
    return $false
}
# Run the crawl with the settings defined above (arguments are passed positionally).
findDownloadUrl $startUrl $fileExtension $maxDepth
Categories: