#encoding:UTF-8
import urllib.request
import bs4
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse


def WriteFile(path, str):
    """Append ``str`` plus a trailing newline to the file at ``path``.

    The file is opened in ``"a+"`` mode, so it is created when missing and
    existing content is kept.

    Args:
        path: Path of the file to append to.
        str: Text to write; a ``"\\n"`` is added after it.  (The name shadows
            the builtin but is kept so existing callers are unaffected.)
    """
    # The context manager closes the handle even if the write fails; the
    # previous version left one open handle behind per call.
    with open(path, "a+") as f:
        f.write(str + "\n")
# Fetch each list page (1..133) of the category and parse it.
for page in range(1, 134):
    # Page 1 is the bare directory URL; every other page is "<n>.htm".
    i = "" if page == 1 else str(page) + ".htm"
    url = "http://www.mycodes.net/5/" + i
    print(url + "\n")
    ##################################################
    # Get the href content of the <a> tags in the list page
    # Read inside a context manager so the HTTP connection is released
    # before the next request instead of waiting for garbage collection.
    with urllib.request.urlopen(url) as conn:
        response = conn.read()
    # Undecodable bytes are dropped ('ignore') rather than raising.
    html = response.decode('gbk', 'ignore')
    soup = BeautifulSoup(html, "html.parser")
    # NOTE(review): nothing visible here consumes `table`; the link
    # extraction step presumably lives in code not shown - confirm.
    table = soup.find_all("table", width=re.compile("97%"), border="0")
from urllib import request

# Download every URL listed (one per line) in url.txt into the current
# directory, naming each file after the last path segment of its URL.
with open(r"url.txt", "r") as f:
    lines = f.readlines()  # read the whole file

for line in lines:
    downUrl = line.strip()
    if not downUrl:
        # A blank line would reach urlopen('') and abort the whole run.
        continue
    print(downUrl)
    filename = downUrl.split('/')[-1]
    print(filename)
    with request.urlopen(downUrl) as reponse, open(filename, 'wb') as outFile:
        outFile.write(reponse.read())
def _normalize_path(raw):
    """Return the lower-cased URL path of ``raw`` minus one leading '/'."""
    path = urlparse(raw.lower()).path
    return path[1:] if path.startswith('/') else path


# Reduce every entry to its normalised path, drop empty results and
# duplicates in a single pass, and sort so the printed list (and anything
# iterating it later) has a stable order from run to run.
pathList = sorted({p for p in map(_normalize_path, pathList) if p})
print(pathList)
# sys.exit()
print('\n\n\n\n')

# Scan log.txt and, for every line containing one of the known resource
# strings, record the match under the line's second ':'-separated field:
#   infoDict[key]   -> list of distinct resources matched for that key
#   resultDict[key] -> number of resources recorded for that key
with open("log.txt", "r") as f:
    lines = f.readlines()  # read the whole file

for line in lines:
    # Each line is lower-cased before the substring checks below.
    line = line.replace('\n', '').lower()
    # Same order as the original four loops: pictures, scripts,
    # stylesheets, then paths.
    for candidates in (picList, jsList, cssList, pathList):
        for resource in candidates:
            if resource not in line:
                continue
            # The key is only computed for matching lines, so non-matching
            # lines without a ':' are still tolerated.
            key = line.split(':')[1]
            matched = infoDict.setdefault(key, [])
            if resource in matched:
                continue
            matched.append(resource)
            # NOTE(review): the source had lost its indentation; this
            # assumes the counter is bumped only when a resource is first
            # recorded for the key, not on every matching line - confirm.
            resultDict[key] = resultDict.get(key, 0) + 1