#!/usr/bin/env python3
# requires the third-party packages: beautifulsoup4, requests
import urllib.parse
import bs4
import sys
import requests
import time
import datetime

# time.sleep(1)

# identify the crawler politely; 'From' should be a contact email address
headers = {
    'User-Agent': 'Other.Big.Crawler.Baby',
    'From': 'xxx'  # an email address
}

# domains to skip entirely
ignore = ['youtube', 'instagram', 'twitter', 'facebook', 'linkedin', 'tiktok']


def fileHandler(openType, filename, content):
    # write ('w') or append ('a') text to the output file
    if openType == 'a' or openType == 'w':
        with open(filename, openType) as f:
            f.write(content)


def htmlHeader(filename):
    # start the output page with a minimal HTML wrapper (placeholder markup)
    # and a generation timestamp
    now = datetime.datetime.now()
    html = f"""<html>
<head><title>{filename}</title></head>
<body>
<p>Page generated at {now.strftime("%Y-%m-%d %H:%M:%S")}</p>
<pre>
"""
    fileHandler('w', filename, html)

def htmlFooter(filename):
    # close the output page (placeholder markup) and note which script generated it
    html = f"""
</pre>
<p>{sys.argv[0]}</p>
</body>
</html>
"""
    fileHandler('a', filename, html)


def notRelative(i):
    # True if the href is an absolute http(s) URL, False if it is missing or relative
    if i is None:
        return False
    return i.startswith("https:") or i.startswith("http:")


def urlsplit(i):
    return urllib.parse.urlsplit(i)


def removeDup(i):
    # remove duplicates while preserving order
    return list(dict.fromkeys(i))


def crawlPage(link):
    # fetch one page and collect the absolute links found on it,
    # keeping at most one link per domain (netloc)
    for i in ignore:
        if i in link:
            print('skipping: ' + i + ' ' + link)
            return False
    urlOnly = False
    doneNetLoc = []
    linkList = []
    try:
        # guard against network errors and hangs
        html = requests.get(link, headers=headers, timeout=10)
    except requests.RequestException as e:
        print('request failed: ' + link + ' ' + str(e))
        return False
    soup = bs4.BeautifulSoup(html.text, "html.parser")
    # print('###### href links in ' + link + ' ######')
    for links in soup.find_all("a"):
        if notRelative(links.get("href")):
            url = urlsplit(links.get("href"))
            if urlOnly:
                linkList.append(url.scheme + '://' + url.netloc)
            else:
                if url.netloc not in doneNetLoc:
                    doneNetLoc.append(url.netloc)
                    linkList.append(links.get("href"))
    return linkList


def combinePageLinks(link):
    links = crawlPage(link)
    if links:
        return [link, links]
    else:
        return False


def addLinksList(arrA, arrB):
    # append the items of arrB (flattening one level of nesting) onto arrA
    if isinstance(arrB, list):
        for i in arrB:
            if isinstance(i, list):
                for j in i:
                    arrA.append(j)
            else:
                arrA.append(i)
    return arrA


def addSocialGraph(arrA, link, arrB):
    # record the page and the list of links found on it
    if isinstance(arrB, list):
        arrA.append([link, arrB])
    return arrA


if sys.version_info[0] == 3:
    version = 'python3'
else:
    version = 'python'

done = []
todo = []
socialGraph = []
iterator = 0
limit = 10

if len(sys.argv) < 2:
    print('No commandline arguments entered. You can enter multiple URLs.')
    print('Example:')
    print(version + ' ' + sys.argv[0] + ' ' + str(limit) + ' http://example.com')
    print(version + ' ' + sys.argv[0] + ' limit the_first_webpage')
    sys.exit()
else:
    # argv[1] is the crawl limit; every argument after it is a seed URL
    for count, link in enumerate(sys.argv):
        if count > 0:
            if count == 1:
                limit = link
            else:
                todo.append(link)

while iterator < int(limit):
    if len(todo) == 0:
        # nothing left to visit
        break
    current = todo.pop()
    done.append(current)
    combined = combinePageLinks(current)  # [link, array_of_links]
    # print(combined[1])
    if combined and len(combined[1]) > 0:
        socialGraph = addSocialGraph(socialGraph, current, combined[1])
        todo = addLinksList(todo, combined[1])
        todo = removeDup(todo)
        # don't queue pages that have already been crawled
        todo = [u for u in todo if u not in done]
    iterator = iterator + 1
    # print(iterator)
    # time.sleep(1)

'''
for count, link in enumerate(socialGraph):
    print('+----+' + link[0])
    for count, href in enumerate(link[1]):
        print(' |')
        print(' +--- ' + str(count + 1) + '. ' + href + '\n')
'''

# write the crawl results as a simple text tree inside the HTML page
filename = 'crawl.output.html'
htmlHeader(filename)
for link in socialGraph:
    fileHandler('a', filename, "\n\n+---+" + link[0])
    for count, href in enumerate(link[1]):
        html = "\n    |\n    +---" + str(count + 1) + " " + href
        fileHandler('a', filename, html)
htmlFooter(filename)