1 Star 0 Fork 0

star_dev / osc_rec

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
sime_topic_match.py 6.47 KB
一键复制 编辑 原始数据 按行查看 历史
'''
author: starlee
@2019.03.07
利用简单的特征匹配进行推荐
1. 提取用户的兴趣特征topic
2. 用topic匹配文章,并过滤
3. 对文章进行排序
'''
from dbop import conn
class TopicRec:
    '''Base class for recommenders; defines the common interface.'''

    def __init__(self):
        pass

    def rec(self, user_id, article_num=10):
        '''Recommend up to ``article_num`` articles for ``user_id``.

        The base implementation recommends nothing and always returns an
        empty list; subclasses override it.
        '''
        return []


class SimpleTopicRec(TopicRec):
    '''Recommend articles by matching a user's topics against article tags.

    The pipeline runs in four stages, each storing its result on the
    instance so the stages can also be run (and timed) one at a time:

    1. ``user_topic``     -> ``self.u_reads`` / ``self.u_topics``
    2. ``match_article``  -> ``self.matched_articles``
    3. ``article_filter`` -> prunes ``self.matched_articles``
    4. ``sort_article``   -> ``self.article_score``
    '''

    def __init__(self, uid=None):
        super().__init__()
        self.uid = uid
        # Initialise every piece of pipeline state up front so that calling a
        # stage out of order yields an empty result instead of AttributeError.
        self.u_reads = set()         # ids of the articles the user has read
        self.u_topics = {}           # {topic: accumulated weight}, top N only
        self.matched_articles = {}   # {aid: {"topics": ..., "created_at": ...}}
        self.article_score = []      # [(aid, score)], highest score first

    def user_topic(self, Top_N=10):
        '''Build the user's topic profile from the articles they have read.

        Stores the ids of the read articles in ``self.u_reads`` and the
        ``Top_N`` topics with the highest accumulated weight in
        ``self.u_topics`` (``{topic: weight}``, heaviest first).
        Returns ``None``.

        An article that was read several times is counted once per read row.
        '''
        # TODO: a dedicated stream job should keep a user-topic table up to
        # date instead of recomputing the profile on every call.
        self.u_reads = set()
        u_topics = {}
        cursor = conn.cursor()
        try:
            cursor.execute(
                'select aid from "iTags"."reads" where uid=%s', (self.uid,))
            read_articles = cursor.fetchall()
            for row in read_articles:
                aid = row[0]
                self.u_reads.add(aid)
                cursor.execute(
                    'select tags from "iTags".articles where id=%s', (aid,))
                found = cursor.fetchone()
                # Skip articles that no longer exist or have null tags.
                if found is None or found[0] is None:
                    continue
                # Accumulate the weight of each of the article's topics.
                for topic, weight in found[0].items():
                    u_topics[topic] = u_topics.get(topic, 0) + weight
        finally:
            # Release the cursor even when a query fails.
            cursor.close()
        ranked = sorted(u_topics.items(), key=lambda kv: kv[1], reverse=True)
        self.u_topics = dict(ranked[:Top_N])

    def __topic_match_score(self, tlst1, tlst2):
        '''Return the similarity score of two topic mappings.

        tlst1: {topic_name: frequency}        (the user's profile)
        tlst2: {topic_name: weight}           (an article's tags)

        The score is the sum of ``frequency * weight`` over the topics that
        appear in both mappings; an empty or ``None`` mapping scores 0.
        '''
        if not tlst1 or not tlst2:
            return 0
        return sum(
            freq * tlst2[t_name]
            for t_name, freq in tlst1.items()
            if t_name in tlst2
        )

    def sort_article(self):
        '''Rank the matched articles by topic similarity.

        Stores ``[(aid, score), ...]`` sorted by descending score in
        ``self.article_score``.
        '''
        # Only topic similarity is scored for now; other article attributes
        # can be folded into the score later.
        scored = [
            (aid, self.__topic_match_score(self.u_topics, info["topics"]))
            for aid, info in self.matched_articles.items()
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        self.article_score = scored

    def article_filter(self):
        '''Filter the matched articles in place.

        Currently only removes articles the user has already read.
        '''
        for aid in self.u_reads:
            # pop() with a default ignores read articles that never matched.
            self.matched_articles.pop(aid, None)

    def match_article(self, weight_lim=0.1):
        '''Collect the articles related to the user's topics.

        For every topic in ``self.u_topics`` an article is matched when its
        tags contain that topic with a weight greater than ``weight_lim``.
        Stores ``{aid: {"topics": tags, "created_at": created_at}}`` in
        ``self.matched_articles``.
        '''
        # TODO: a dedicated article-topic table would avoid scanning the
        # articles once per topic.
        matched = {}
        cursor = conn.cursor()
        try:
            for topic in self.u_topics:
                cursor.execute(
                    'select id, tags, created_at from "iTags".articles where tags?%s',
                    (topic,))
                for aid, a_topics, a_time in cursor.fetchall():
                    # The SQL "?" operator only checks that the key exists, so
                    # the documented weight threshold is applied here.
                    if a_topics.get(topic, 0) <= weight_lim:
                        continue
                    # Keyed by article id, so an article matched through
                    # several topics is kept only once.
                    matched[aid] = {"topics": a_topics, "created_at": a_time}
        finally:
            # Release the cursor even when a query fails.
            cursor.close()
        self.matched_articles = matched

    def rec(self, uid, article_num=10):
        '''Recommend up to ``article_num`` articles for user ``uid``.

        Returns a list of ``(aid, score)`` tuples, best match first.
        '''
        self.uid = uid
        # step 1: extract the user's topic profile
        self.user_topic()
        # step 2: match articles against the profile
        self.match_article()
        # step 3: filter the candidates
        self.article_filter()
        # step 4: rank the candidates
        self.sort_article()
        return self.article_score[:article_num]
def _tag_name(cursor, tag_id):
    '''Return the name stored for ``tag_id``, or None if missing or null.'''
    cursor.execute('select name from "iTags".tags where id=%s', (tag_id,))
    row = cursor.fetchone()
    return None if row is None else row[0]


def _demo(user_id=15, show_top=20):
    '''Run the recommendation pipeline step by step, timing each stage.

    Prints the user's topics, the time taken by each stage, and the
    ``show_top`` best-scored articles with their tag names.
    For an end-to-end run use ``SimpleTopicRec().rec(user_id)`` instead.
    '''
    import time

    cursor = conn.cursor()
    try:
        smptpRec = SimpleTopicRec(user_id)

        # Stage 1: the user's topic profile.
        start_time = time.perf_counter()
        smptpRec.user_topic()
        print("user_topic: ", time.perf_counter() - start_time)
        for tid, weight in smptpRec.u_topics.items():
            name = _tag_name(cursor, tid)
            if name is None:
                continue
            print(name, weight)
        print("*" * 20)

        # Stages 2-4: match, filter and rank the articles.
        start_time = time.perf_counter()
        smptpRec.match_article()
        print("match_article: ", time.perf_counter() - start_time)

        start_time = time.perf_counter()
        smptpRec.article_filter()
        print("filter_article: ", time.perf_counter() - start_time)

        start_time = time.perf_counter()
        smptpRec.sort_article()
        print("sort_article: ", time.perf_counter() - start_time)

        # Show the best-scored articles together with their tag names.
        for aid, _score in smptpRec.article_score[:show_top]:
            cursor.execute(
                'select title, tags, created_at from "iTags".articles where id=%s',
                (aid,))
            a_info = cursor.fetchone()
            if a_info is None:
                continue
            a_tags = []
            # Tags may be null; treat that as "no tags".
            for tid in (a_info[1] or {}):
                name = _tag_name(cursor, tid)
                if name is None:
                    continue
                a_tags.append(name)
            print(aid, "、".join(a_tags), " >> ", a_info[0], " >> ", a_info[2])
    finally:
        # Release the cursor even when a stage fails.
        cursor.close()


if __name__ == "__main__":
    _demo()
1
https://gitee.com/lee-star/osc_rec.git
git@gitee.com:lee-star/osc_rec.git
lee-star
osc_rec
osc_rec
master

搜索帮助