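# NOTE: web-page notice captured when this file was copied; not part of the program.
# Translation: "Code pull complete; the page will refresh automatically."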
__author__ = 'Administrator'
from urllib2 import urlopen
import urlparse
import bs4
# Site root; every query path below is appended to it.
BASE_URL = "http://soccerdata.sports.qq.com"

# Player-search path template: %s is a league code, %d is a page number.
PLAYER_LIST_QUERY = "/playerSearch.aspx?lega=%s&pn=%d"

# League codes substituted into the ``lega`` query parameter.
league = [
    'epl',
    'seri',
    'bund',
    'liga',
    'fran',
    'scot',
    'holl',
    'belg',
]

# Number of result pages requested per league (pages 0 .. limit - 1).
page_number_limit = 100

# Column names for a scraped player record.
# NOTE(review): get_players appears to emit 14 values per row (10 padded
# cells plus link, id, teamid, lega) while 13 names are listed here --
# confirm the intended alignment.
player_fields = [
    'league_cn',
    'img',
    'name_cn',
    'name',
    'team',
    'age',
    'position_cn',
    'nation',
    'birth',
    'query',
    'id',
    'teamid',
    'league',
]
def _as_unicode(value):
    """Return *value* as a unicode string, decoding UTF-8 byte strings."""
    # unicode(u'...', 'utf-8') raises TypeError on Python 2, so only byte
    # strings are decoded; values that are already unicode pass through.
    if isinstance(value, unicode):
        return value
    return unicode(value, 'utf-8')


def _link_query(cell):
    """Return the query string of the first <a href> inside *cell*, or None."""
    anchor = cell.a
    if anchor is None:
        return None
    href = anchor.get('href')
    if href is None:
        return None
    try:
        return urlparse.urlparse(href).query
    except (TypeError, ValueError, AttributeError):
        # Malformed href: treat the cell as having no usable link.
        return None


def get_players(baseurl):
    """Fetch one search-result page and return its rows as flat records.

    The page at *baseurl* is downloaded and parsed with BeautifulSoup/lxml.
    Every ``.searchResult tr`` row whose second child is not a ``th`` yields
    one record (a list) laid out as:

    * one entry per tag cell: the ``src`` of the cell's image when the cell
      has no single string but contains an ``img``; otherwise the stripped
      cell text, or ``'na'`` when that text is missing or empty;
    * ``'na'`` padding up to 10 entries when fewer cells were found;
    * the query string of the first link in the row that has a non-empty
      query (``''`` when there is none);
    * the ``id``, ``teamid`` and ``lega`` parameters of that query string,
      each falling back to ``'na'`` when absent.

    The last four entries are unicode strings.

    Returns:
        list of records, in document order (empty when no row matches).
    """
    html = urlopen(baseurl).read()
    soup = bs4.BeautifulSoup(html, "lxml")
    rows = [row for row in soup.select('.searchResult tr')
            if row.contents[1].name != 'th']

    result = []
    for row in rows:
        record = []
        link = ''
        # A dict (not a list) so that a row without any link degrades to
        # 'na' values below instead of raising TypeError on query["id"].
        query = {}
        for cell in row.contents:
            # Skip whitespace / text nodes between the cells.
            if not isinstance(cell, bs4.element.Tag):
                continue

            text = cell.string
            if not text and cell.img:
                record.append(cell.img['src'])
            else:
                stripped = text.strip() if text else ''
                record.append(stripped or 'na')

            # Only the first non-empty query string in the row is kept.
            if not link:
                found = _link_query(cell)
                if found is not None:
                    link = found
                    query = dict(
                        (key, values[0])
                        for key, values in urlparse.parse_qs(found).items()
                    )

        # Pad short rows to 10 cell entries; a non-positive count adds nothing.
        record.extend(['na'] * (10 - len(record)))

        record.append(_as_unicode(link))
        for key in ('id', 'teamid', 'lega'):
            record.append(_as_unicode(query.get(key, 'na')))
        result.append(record)
    return result
# Crawl every page of every league and collect all player records.
# extend() grows the list in place; the previous ``result = result + ...``
# copied the whole accumulated list on each of the 800 iterations.
result = []
for league_code in league:
    for page in range(page_number_limit):
        url = BASE_URL + PLAYER_LIST_QUERY % (league_code, page)
        result.extend(get_players(url))
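# NOTE: web-page moderation notice captured when this file was copied; not part of the program.
# Translation: "This section may contain content unsuitable for display, so the page does not
# show it. You can review and modify it through the relevant editing functions. If you confirm
# the content involves no inappropriate language, pure advertising, violence, vulgar or
# pornographic material, infringement, piracy, false or worthless content, or content that
# violates national laws and regulations, you may click submit to appeal and we will handle it
# as soon as possible."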