一个简单的Python写的XML爬虫
来源:程序员人生 发布时间:2013-11-06 16:22:29 阅读次数:2851次
原理很简单:读取XML结构,取得返回值并进行判断;然后根据返回的值得到下一个XML的地址,再进行判断。
Python的class和PHP的差不多,思路是一样的。
#-*- encoding: utf-8 -*-
import codecs
import sys
import threading
from urllib import urlencode
from urllib2 import urlopen
from xml.dom.minidom import parseString
class Serach:
    """Crawl keyword suggestions from two XML endpoints and save the hits.

    ``SendPy`` fetches the ``SuggestWord`` elements for a key, ``SendKey``
    reads the ``ErrorCode`` reported for one suggestion, and ``run`` combines
    both with database lookups and writes the result to a text file.

    NOTE(review): ``run`` reads ``self.conn`` and ``self.savePath`` and calls
    ``self.MySQLKey`` / ``self.MySQLPy``. None of them is defined in this
    class as shown, so they have to be supplied elsewhere (for example by a
    subclass) before ``run`` is called.
    """

    # Value SendKey returns when no ErrorCode could be read.
    _NO_ERROR_CODE = 104

    # XML declarations swapped after the payload is transcoded to UTF-8.
    _GBK_DECL = b'<?xml version="1.0" encoding="gbk"?>'
    _UTF8_DECL = b'<?xml version="1.0" encoding="utf-8"?>'

    def __init__(self, key=None):
        """Remember *key*, the search key that ``run`` crawls."""
        self.key = key

    def SendPy(self, key):
        """Return the ``SuggestWord`` elements the ``ac_box`` endpoint lists.

        Args:
            key: value sent as the ``ac`` query parameter. ``self.key`` is
                used when it is ``None``.

        Returns:
            The matching minidom elements, or an empty list when the
            download or the XML parsing fails.
        """
        if key is None:
            key = self.key
        if key is None:
            return []
        try:
            # urlencode escapes spaces, '&', '#' and non-ASCII bytes that
            # would otherwise corrupt the request line.
            url = "http://xxxx.com/ac_box?" + urlencode([("ac", key)])
            contentpy = urlopen(url).read()
        except Exception:
            # Best-effort crawl: report and give up on this key.  Unlike a
            # bare "except:", KeyboardInterrupt/SystemExit still propagate.
            print("down load py!")
            return []
        try:
            xmldoc = parseString(contentpy)
        except Exception:
            print("ill formed xml file")
            return []
        # Walk the XML structure and collect the suggestion nodes.
        return xmldoc.documentElement.getElementsByTagName('SuggestWord')

    def SendKey(self, keyword):
        """Return the ``ErrorCode`` the ``btinfo`` endpoint gives *keyword*.

        Args:
            keyword: text keyword; it is sent GBK-encoded.

        Returns:
            The ``ErrorCode`` attribute of the first ``Query`` element as a
            string, or the integer 104 when the download fails, the payload
            cannot be parsed, or no ``Query`` element is present.
        """
        keyword = keyword.encode('gbk')
        query = urlencode([("keyword", keyword), ("num", 1)])
        try:
            # Read the XML address.
            content = urlopen("http://xxxx.com/btinfo?" + query).read()
        except Exception:
            print("down load key!")
            return self._NO_ERROR_CODE
        return self._error_code_from_xml(content)

    @staticmethod
    def _error_code_from_xml(content):
        """Extract the ``ErrorCode`` from a GBK-encoded ``btinfo`` payload."""
        try:
            # Transcode to UTF-8 and replace the declaration to match, then
            # parse the re-encoded document.
            content = content.decode("cp936").encode("utf-8")
            content = content.replace(Serach._GBK_DECL, Serach._UTF8_DECL)
            xmldoc = parseString(content)
        except Exception:
            # Decode and parser errors differ between Python versions, so
            # the whole family is treated as "no usable document".
            print("ill formed xml file")
            return Serach._NO_ERROR_CODE
        queries = xmldoc.getElementsByTagName('Query')
        if not queries:
            return Serach._NO_ERROR_CODE
        return queries[0].getAttribute('ErrorCode')

    def run(self):
        """Crawl ``self.key`` and write the unique hits to a text file.

        Each suggestion for which ``SendKey`` returns ``'0'`` and
        ``MySQLKey`` returns something other than ``'2'`` contributes a
        ``word|flag`` entry.  For keys longer than three characters every
        row from ``MySQLPy`` adds ``row[0]|0`` or ``row[0]|1`` entries,
        ``0`` when the fetched column is ``'rmvb'`` or ``'rm'``.  The
        entries are de-duplicated, joined with commas and written as UTF-8
        to ``self.savePath + self.key + '.txt'``.

        NOTE(review): the nesting of this method was reconstructed from a
        source whose indentation had been lost - confirm against the
        intended behaviour.
        """
        suggestions = self.SendPy(self.key)
        cur = self.conn.cursor()
        entries = []

        for node in suggestions:
            if node.firstChild is None:
                # An empty <SuggestWord/> carries no text to look up.
                continue
            word = node.firstChild.data
            if self.SendKey(word) != '0':
                continue
            flag = self.MySQLKey(word)
            if flag != '2':
                entries.append(word + '|' + flag)

        if len(self.key) > 3:
            for row in self.MySQLPy(self.key) or ():
                # NOTE(review): the statement text is a placeholder in the
                # source as shown and does not depend on the row.
                tsql = "xxxx"
                cur.execute(tsql)
                for rec in cur.fetchall():
                    kind = '0' if rec[0] in ('rmvb', 'rm') else '1'
                    entries.append(row[0] + '|' + kind)

        # Filter out duplicate entries, keeping first-seen order so the
        # output is deterministic.
        seen = set()
        unique = []
        for entry in entries:
            if entry not in seen:
                seen.add(entry)
                unique.append(entry)

        out_path = self.savePath + self.key + '.txt'
        with codecs.open(out_path, 'w', 'utf-8') as file_object:
            file_object.write(','.join(unique))
if __name__ == "__main__":
    # Script entry point: crawl the key given as the first command-line
    # argument; without an argument nothing is done.
    cli_args = sys.argv[1:]
    if cli_args:
        s = Serach(cli_args[0])
        s.run()