有道字典蜘蛛程序
想做一个 iPhone 上的字典程序，就写了个小巧的脚本，根据提供的单词列表抓取有道字典的资源。
#!/usr/bin/python
#encoding=utf-8
import sys,time,codecs,socket
from urllib2 import urlopen,quote
from re import S,sub,compile
from BeautifulSoup import *
# Make UTF-8 the process-wide default string encoding (Python 2 only) and
# give every socket operation a 10 second timeout.
reload(sys)
sys.setdefaultencoding('utf-8')
socket.setdefaulttimeout(10)

# Search URL; the URL-quoted word is appended to it.
baseURL = "http://dict.youdao.com/search?le=eng&q="

# Session start, used for the log header and the progress/ETA display.
startTime = time.time()
startTimeString = time.strftime('%Y-%m-%d %X', time.localtime(startTime))

# Output files. All are opened in append mode and stay open for the whole
# run; the main loop at the bottom of the script closes them.
logFileName = 'log.txt'
logFile = codecs.open(logFileName, 'a', 'utf-8')
# Receives the words that were skipped because nothing was found for them.
hulueFileName = "hulue.txt"
hulueFile = open(hulueFileName, 'a')
resultFileName = "result.txt"
resultFile = codecs.open(resultFileName, 'a', 'utf-8')

# Input: the words to look up, one per line.
wordListFileName = 'wordlist.txt'
with open(wordListFileName, 'r') as wordListFile:
    lines = wordListFile.readlines()

# Input: GBK-encoded mnemonic table, searched as "word<TAB>text" lines.
zhujiFileName = "zhuji.db"
with codecs.open(zhujiFileName, 'r', 'gbk') as zhujiFile:
    zhujiFileContent = zhujiFile.readlines()
def log(message):
    """Echo *message* to stdout and append it, newline-terminated, to the log file."""
    print(message)
    logFile.write(message + '\n')
def getRemoteFileFromWebForWord(word):
    """Download the search result page for *word*.

    The word is URL-quoted and appended to ``baseURL``.

    Returns:
        The raw response body, or None when the request fails for any
        reason, so that the caller can wait and retry.
    """
    try:
        response = urlopen(baseURL + quote(word))
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails part-way.
            response.close()
    except Exception:
        # Deliberately best-effort: any network/HTTP error means "no page".
        # Catching Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate, so the script can be
        # stopped while a request is in flight.
        return None
def getZhujiForWord(word, entries=None):
    """Look up the mnemonic ("zhuji") text recorded for *word*.

    Each entry is expected to look like ``word<TAB>text``. The word is
    matched literally, so words containing regular-expression
    metacharacters (``c++``, ``a.m.``) are handled correctly.

    Args:
        word: The headword to search for.
        entries: Optional iterable of entry lines to search. Defaults to
            the module-level ``zhujiFileContent``.

    Returns:
        Everything after the first tab of the first matching entry, up to
        (not including) the newline, or None when no entry matches.
    """
    if entries is None:
        entries = zhujiFileContent
    prefix = word + '\t'
    for entry in entries:
        if entry.startswith(prefix):
            # Cut at the first newline only, matching what ``.*`` captured
            # in the previous regex-based implementation.
            return entry[len(prefix):].split('\n', 1)[0]
    return None
def bannedByRemote(soup):
    """Tell whether the fetched page is the captcha ("enter verification code") page.

    Args:
        soup: Parsed page; only its ``find`` method is used.

    Returns:
        True when a ``<td class="titlebluestg1">`` cell exists and its text,
        ignoring surrounding whitespace, is the captcha prompt; False
        otherwise.
    """
    promptCell = soup.find('td', {'class': 'titlebluestg1'})
    if promptCell is None:
        return False
    # Strip so that padding/newlines around the prompt do not hide the ban.
    return promptCell.getText().strip() == u'请输入验证码'
def getJieshiFromSoup(soup):
    """Extract the definition ("jieshi") nodes from a parsed result page.

    Several page layouts are tried in order; the first selector that yields
    at least one node wins.

    Args:
        soup: Parsed page; its ``findAll`` and ``find`` methods are used.

    Returns:
        The non-empty ``findAll`` result of the first matching selector;
        otherwise the single ``<font class="p13">`` node when it exists and
        has content; otherwise None.
    """
    selectors = (
        ("td", "attributem1web"),
        ("span", "meaning"),
        ("div", "meaning"),
        ("td", "dttitle2"),
    )
    for tagName, className in selectors:
        matches = soup.findAll(tagName, {"class": className})
        if len(matches) > 0:
            return matches

    # Last resort: a single node rather than a list of nodes.
    fallback = soup.find("font", {"class": "p13"})
    # len() is checked explicitly instead of relying on truthiness, because
    # the original code tests the node's length rather than its truth value.
    if fallback is None or len(fallback) == 0:
        return None
    return fallback
# Main driver: look up every word of the word list and append one
# tab-separated record per word to the result file:
#
#     word <TAB> phonetic <TAB> definitions <TAB> mnemonic <TAB> examples <TAB>
#
# Multiple definitions / example fragments are separated by '|'.


def _stripSpaces(text):
    """Return *text* with every space character removed."""
    return text.replace(' ', '')


def _formatJieshi(jieshis):
    """Flatten the value returned by getJieshiFromSoup into one string."""
    if isinstance(jieshis, ResultSet):
        # Joining avoids the old "is this the last element?" comparison,
        # which compared nodes by value and could drop a separator.
        return '|'.join([_stripSpaces(jieshi.getText()) for jieshi in jieshis])
    if isinstance(jieshis, Tag):
        return _stripSpaces(jieshis.getText())
    return ''


def _collectLiju(soup):
    """Return the example-sentence ("liju") fragments of *soup*, '|'-joined."""
    fragments = []
    for lijuDivName in ("ssResultDiv1", "ssResultDiv2", "ssResultDiv3"):
        lijuSoup = soup.find(id=lijuDivName)
        if lijuSoup is None:
            continue
        lijus = lijuSoup.findAll("span")
        if len(lijus) > 2:
            # The third <span> is discarded, exactly as before.
            # NOTE(review): presumably it is not sentence text - confirm
            # against the page markup.
            del lijus[2]
        # '#' is inserted between text nodes to mark the emphasised keyword.
        fragments.extend([liju.getText("#") for liju in lijus])
    # Joining guarantees there is never a trailing separator, even when the
    # last example block is missing from the page.
    return '|'.join(fragments)


log('Start session at ' + startTimeString)
totalLines = len(lines)
try:
    for currentLine, line in enumerate(lines, 1):
        # Strip CR as well as LF so CRLF word lists work; skip blank lines
        # instead of sending an empty query.
        word = line.rstrip('\r\n')
        if not word:
            continue

        elapsedTime = time.time() - startTime
        elapsedTimeString = time.strftime('%X', time.gmtime(elapsedTime))
        remainingTime = (elapsedTime / currentLine) * (totalLines - currentLine)
        estimateFinishTimeString = time.strftime('%X', time.gmtime(remainingTime))
        percent = float(currentLine) / totalLines * 100
        print(u"处理 %s 当前进度:%d/%d (%.4f%%) 已用时间:%s 剩余时间:%s" % (word, currentLine, totalLines, percent, elapsedTimeString, estimateFinishTimeString))

        # Keep retrying every 30 seconds until the page can be downloaded.
        remoteFile = getRemoteFileFromWebForWord(word)
        while remoteFile is None:
            log(u"********** %s 远程页面加载失败,30秒后重试" % word)
            time.sleep(30)
            remoteFile = getRemoteFileFromWebForWord(word)

        soup = BeautifulSoup(remoteFile)
        if bannedByRemote(soup):
            # The site answered with its captcha page: stop the session.
            log(u"********** %s IP暂时被封,程序退出" % word)
            break

        # Phonetic transcription
        yinbiao = soup.find("span", {"class": "pronounce"})
        if yinbiao is not None:
            yinbiaoText = _stripSpaces(yinbiao.getText())
        else:
            log(u"*** %s 音标未找到" % word)
            yinbiaoText = ''

        # Definitions
        jieshis = getJieshiFromSoup(soup)
        if jieshis is not None:
            jieshiText = _formatJieshi(jieshis)
        else:
            log(u"*** %s 解释未找到" % word)
            jieshiText = ''

        # Nothing useful on the page: record the word as skipped.
        if yinbiao is None and jieshis is None:
            log(u"***** %s 已忽略,因为音标与解释均未找到" % word)
            hulueFile.write(word + '\n')
            continue

        # Mnemonic
        zhuji = getZhujiForWord(word)
        if zhuji is not None:
            # Remove tabs, which would break the tab-separated layout.
            zhujiText = zhuji.replace('\t', '')
        else:
            log(u"*** %s 辅助记忆未找到" % word)
            zhujiText = ''

        # Example sentences
        lijuText = _collectLiju(soup)

        resultLineContent = '\t'.join([word, yinbiaoText, jieshiText, zhujiText, lijuText]) + '\t\n'
        resultFile.write(resultLineContent)
finally:
    # Close the output files even when the run is interrupted or fails.
    logFile.close()
    resultFile.close()
    hulueFile.close()
有道字典蜘蛛程序
http://hellohtml5.blogspot.com/2010/05/blog-post_22.html
没有评论:
发表评论