3 Star 0 Fork 0

LJ / Soccers

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
GetDate.py 1.77 KB
AI 代码解读
一键复制 编辑 原始数据 按行查看 历史
LJ 提交于 2015-03-01 19:05 . 添加spider Match500 和其它
__author__ = 'Administrator'
from urllib2 import urlopen
import urlparse
import bs4
# Root of the site being scraped; query paths below are appended to it.
BASE_URL = 'http://soccerdata.sports.qq.com'

# Player search path; filled with (league code, page number).
PLAYER_LIST_QUERY = '/playerSearch.aspx?lega=%s&pn=%d'

# League codes substituted into the ``lega`` query parameter.
league = [
    'epl',
    'seri',
    'bund',
    'liga',
    'fran',
    'scot',
    'holl',
    'belg',
]

# Number of result pages requested per league (pages 0 .. limit - 1).
page_number_limit = 100

# Presumably the column names for the records built by get_players.
# NOTE(review): get_players emits at least 14 values per record while this
# list names 13 -- confirm the intended mapping.
player_fields = [
    'league_cn',
    'img',
    'name_cn',
    'name',
    'team',
    'age',
    'position_cn',
    'nation',
    'birth',
    'query',
    'id',
    'teamid',
    'league',
]
def _to_unicode(value):
    """Return *value* as a ``unicode`` object, decoding byte strings as UTF-8."""
    # unicode(x, 'utf-8') raises TypeError when x is already unicode, so only
    # byte strings are decoded.
    if isinstance(value, unicode):
        return value
    return unicode(value, 'utf-8')


def _parse_player_row(row):
    """Flatten one result-table row into a record list (see get_players)."""
    record = []
    link = ''
    query = {}
    for cell in row.contents:
        # Skip text nodes between the cells; only tags carry data.
        if not isinstance(cell, bs4.element.Tag):
            continue

        if not cell.string and cell.img:
            record.append(cell.img['src'])
        else:
            text = cell.string.strip() if cell.string else ''
            record.append(text or 'na')

        # Only the first link with a non-empty query string is recorded.
        anchor = cell.a
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue
        try:
            link_query = urlparse.urlparse(href).query
        except ValueError:
            # Malformed URL: ignore this link, keep the rest of the row.
            continue
        if not link:
            link = link_query
            query = {k: v[0] for k, v in urlparse.parse_qs(link_query).items()}

    # Pad short rows so the link-derived values always start at index 10
    # (a non-positive multiplier yields no padding for longer rows).
    record.extend(['na'] * (10 - len(record)))

    record.append(_to_unicode(link))
    # Parameters absent from the link fall back to the same 'na' placeholder
    # used for missing cells instead of raising.
    for key in ('id', 'teamid', 'lega'):
        record.append(_to_unicode(query.get(key, 'na')))
    return record


def get_players(baseurl):
    """Fetch one player-search page and return its rows as flat records.

    The page at *baseurl* is downloaded and parsed with BeautifulSoup
    (lxml parser).  Every ``.searchResult tr`` row whose second child is not
    a ``th`` element becomes one record.

    Each record is a list holding, in order:

    * one value per tag child of the row -- the ``src`` of its image when
      the cell has no single string but contains an ``img``, otherwise its
      stripped text, or ``'na'`` when that text is empty;
    * ``'na'`` padding up to 10 entries;
    * the query string of the first link in the row that has one
      (``u''`` when there is none);
    * that link's ``id``, ``teamid`` and ``lega`` query parameters
      (``u'na'`` for any that are absent).

    :param baseurl: full URL of the result page to fetch.
    :returns: list of records, one per non-header row.
    :raises: whatever ``urlopen`` raises when the page cannot be fetched.
    """
    response = urlopen(baseurl)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    soup = bs4.BeautifulSoup(html, "lxml")
    # contents[1] is assumed to be the first cell tag (contents[0] being the
    # whitespace before it) -- TODO confirm against the live markup.
    return [
        _parse_player_row(row)
        for row in soup.select('.searchResult tr')
        if row.contents[1].name != 'th'
    ]
# Crawl every result page of every league when the module is run/imported
# and collect all player records into the module-level ``result`` list.
result = []
for league_code in league:
    for page in range(page_number_limit):
        url = BASE_URL + PLAYER_LIST_QUERY % (league_code, page)
        # extend() appends in place; ``result = result + ...`` re-copied
        # every record collected so far on each page.
        result.extend(get_players(url))
Python
1
https://gitee.com/nikytwo/Soccers.git
git@gitee.com:nikytwo/Soccers.git
nikytwo
Soccers
Soccers
master

搜索帮助