<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" media="screen" href="/~d/styles/rss2full.xsl"?><?xml-stylesheet type="text/css" media="screen" href="http://feeds.feedburner.com/~d/styles/itemcontent.css"?><rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:wfw="http://wellformedweb.org/CommentAPI/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:sy="http://purl.org/rss/1.0/modules/syndication/" xmlns:slash="http://purl.org/rss/1.0/modules/slash/" xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" version="2.0">

<channel>
	<title>xlvector - Recommender System</title>
	
	<link>http://xlvector.net/blog</link>
	<description>如果翻墙，可以更好的浏览这个blog</description>
	<lastBuildDate>Wed, 01 Feb 2012 12:46:08 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.org/?v=3.3</generator>
		<atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="self" type="application/rss+xml" href="http://feeds.feedburner.com/blogspot/SHpi" /><feedburner:info uri="blogspot/shpi" /><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="hub" href="http://pubsubhubbub.appspot.com/" /><feedburner:feedFlare href="http://fusion.google.com/add?feedurl=http%3A%2F%2Ffeeds.feedburner.com%2Fblogspot%2FSHpi" src="http://buttons.googlesyndication.com/fusion/add.gif">Subscribe with Google</feedburner:feedFlare><feedburner:feedFlare href="http://www.plusmo.com/add?url=http%3A%2F%2Ffeeds.feedburner.com%2Fblogspot%2FSHpi" src="http://plusmo.com/res/graphics/fbplusmo.gif">Subscribe with Plusmo</feedburner:feedFlare><feedburner:feedFlare href="http://www.thefreedictionary.com/_/hp/AddRSS.aspx?http%3A%2F%2Ffeeds.feedburner.com%2Fblogspot%2FSHpi" src="http://img.tfd.com/hp/addToTheFreeDictionary.gif">Subscribe with The Free Dictionary</feedburner:feedFlare><feedburner:feedFlare href="http://www.bitty.com/manual/?contenttype=rssfeed&amp;contentvalue=http%3A%2F%2Ffeeds.feedburner.com%2Fblogspot%2FSHpi" src="http://www.bitty.com/img/bittychicklet_91x17.gif">Subscribe with Bitty Browser</feedburner:feedFlare><feedburner:feedFlare href="http://www.newsalloy.com/?rss=http%3A%2F%2Ffeeds.feedburner.com%2Fblogspot%2FSHpi" src="http://www.newsalloy.com/subrss3.gif">Subscribe with NewsAlloy</feedburner:feedFlare><feedburner:feedFlare href="http://www.live.com/?add=http%3A%2F%2Ffeeds.feedburner.com%2Fblogspot%2FSHpi" src="http://tkfiles.storage.msn.com/x1piYkpqHC_35nIp1gLE68-wvzLZO8iXl_JMledmJQXP-XTBOLfmQv4zhj4MhcWEJh_GtoBIiAl1Mjh-ndp9k47If7hTaFno0mxW9_i3p_5qQw">Subscribe with Live.com</feedburner:feedFlare><feedburner:feedFlare href="http://mix.excite.eu/add?feedurl=http%3A%2F%2Ffeeds.feedburner.com%2Fblogspot%2FSHpi" src="http://image.excite.co.uk/mix/addtomix.gif">Subscribe with Excite MIX</feedburner:feedFlare><feedburner:feedFlare 
href="http://download.attensa.com/app/get_attensa.html?feedurl=http%3A%2F%2Ffeeds.feedburner.com%2Fblogspot%2FSHpi" src="http://www.attensa.com/blogs/attensa/WindowsLiveWriter/BadgeredintoBadges_10C02/attensa_feed_button5.gif">Subscribe with Attensa for Outlook</feedburner:feedFlare><feedburner:feedFlare href="http://www.webwag.com/wwgthis.php?url=http%3A%2F%2Ffeeds.feedburner.com%2Fblogspot%2FSHpi" src="http://www.webwag.com/images/wwgthis.gif">Subscribe with Webwag</feedburner:feedFlare><feedburner:feedFlare href="http://www.podcastready.com/oneclick_bookmark.php?url=http%3A%2F%2Ffeeds.feedburner.com%2Fblogspot%2FSHpi" src="http://www.podcastready.com/images/podcastready_button.gif">Subscribe with Podcast Ready</feedburner:feedFlare><feedburner:feedFlare href="http://www.flurry.com/pushRssFeed.do?r=fb&amp;url=http%3A%2F%2Ffeeds.feedburner.com%2Fblogspot%2FSHpi" src="http://www.flurry.com/images/flurry_rss_logo2.gif">Subscribe with Flurry</feedburner:feedFlare><feedburner:feedFlare href="http://www.wikio.com/subscribe?url=http%3A%2F%2Ffeeds.feedburner.com%2Fblogspot%2FSHpi" src="http://www.wikio.com/shared/img/add2wikio.gif">Subscribe with Wikio</feedburner:feedFlare><feedburner:feedFlare href="http://www.dailyrotation.com/index.php?feed=http%3A%2F%2Ffeeds.feedburner.com%2Fblogspot%2FSHpi" src="http://www.dailyrotation.com/rss-dr2.gif">Subscribe with Daily Rotation</feedburner:feedFlare><feedburner:browserFriendly>generated by xlvector</feedburner:browserFriendly><item>
		<title>《推荐系统实践》总结</title>
		<link>http://feedproxy.google.com/~r/blogspot/SHpi/~3/ePz4X46xEGc/</link>
		<comments>http://xlvector.net/blog/?p=830#comments</comments>
		<pubDate>Wed, 01 Feb 2012 12:46:08 +0000</pubDate>
		<dc:creator>xlvector</dc:creator>
				<category><![CDATA[未分类]]></category>

		<guid isPermaLink="false">http://xlvector.net/blog/?p=830</guid>
		<description><![CDATA[最近一直在写这本书，所以blog也没有更新。经过一个春节的奋战，这本书终于到了收尾阶段。目前主要在完善已有的内容，调整一些章节次序。所以写篇blog暂时总结一下写作过程。 写本书远远比想象的要困难，而且比写博士论文困难多了。因为博士论文你可以假设没多少人会看，而且看的人水平和自己相似。此外，博士论文也就100多页。目前这本书大概写了200页，内容倒是挺全面，方方面面，边边角角都涉及到了。不过都是蜻蜓点水，仅供入门之用。而且显得有点杂。 其实推荐系统很难成为一个独立的学科，因为他用到的知识是其他领域都会用到的，只是用在了推荐这个问题上而已。下面，以FAQ的形式总结一下书的体系和内容。 问题：本书的定位是？ 回答：本书注重广度，忽略深度。考虑到这是国内第一本同类型的书，所以还是以入门为主。其实更重要的原因是考虑到自身的限制，个人对理论问题不是特别了解，所以怕写错了贻笑大方。而且，深入的问题可以从书中找到相关论文进行深入研究，所以本书主要起介绍作用。 问题：书里面有代码吗？ 回答：有，虽然我号称书里面的代码都是Python，不过经Python高手反应，说写的不像Python。不过，书里面的代码主要是作为一种沟通语言存在，起到解释公式的作用。所以书中的代码比较注重可读性，效率什么的考虑不多。不过，书中有大量的实验，这些实验是通过另外一套注重效率的代码实现的，当然这些代码的可读性就差一点。所以，希望大家把书中的代码当成伪码，其实很多书里面印的代码我都没有实际运行过。在书上的代码之外，我会提供一个真实的可以运行的代码，估计会放在诸如google code这种地方。 问题：Top-N推荐和评分预测问题，更侧重于哪个？ 回答：侧重于Top-N推荐，整个书用了8章讲Top-N推荐，1章讲评分预测。这似乎和这两个问题在学术界的论文数量是颠倒的，不过个人认为TopN推荐更符合实际，而且从来Hulu的近两年的实践看，几乎没有用到评分预测的技术。但是，评分预测问题作为Netflix Prize的著名问题，不说又不好。所以在最后一章介绍了这个问题的著名算法。 问题：本书是不是充满了公式？ 回答：我没办法做到霍金那样一本书只有一个E=mc^2的公式。所以这本书主要采用三种语言和读者交流。第一种当然是中文，希望通过例子说清楚一个算法。第二种是公式，对于搞研究的朋友这这个可能很直接。第三种是代码，主要是为软件工程师考虑，他们容易看懂。 之前公开过一个粗略的目录，这里给出一个目前详细的目录，大家应该能够看出大概。不过这个目录还要大改，不过现在可以凑和看看。 第一章 什么是好的推荐系统 5 第1节 什么是推荐系统 6 第2节 个性化推荐系统的应用 8 2.1 电子商务 9 2.2 电影和视频网站 12 2.3 个性化音乐电台 14 2.4 社会网络 17 2.5 个性化阅读 19 2.6 基于位置的服务 20 2.7 个性化邮件 21 2.8 个性化广告 22 第3节 推荐系统的评测 23 3.1 推荐系统实验方法 25 3.2 评测指标 [...]<table class="wumii-related-items" cellspacing="0" cellpadding="2" border="0" width="100%" style="clear: both;">
    
    <tr>
        <td ><b><font size="-1"  style="display: block !important; padding: 20px 0 5px !important;">您可能也喜欢：</font></b></td>
    </tr>
    
            <tr>
                <td style="margin: 0 !important; padding: 0 !important; line-height: 20px !important;">
                    <img border="0" src="http://static.wumii.com/images/widget/widget_solidPoint.gif">
                    <a target="_blank" style="text-decoration: none !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fxlvector.net%2Fblog%2F%3Fp%3D824&from=http%3A%2F%2Fxlvector.net%2Fblog%2F%3Fp%3D830">
                        <font size="-1" color="#333333" style="line-height: 1.65em; font-size: 12px !important;">《推荐系统实践》关于Latent Factor Model</font>
                    </a>
                </td>
            </tr>
            <tr>
                <td style="margin: 0 !important; padding: 0 !important; line-height: 20px !important;">
                    <img border="0" src="http://static.wumii.com/images/widget/widget_solidPoint.gif">
                    <a target="_blank" style="text-decoration: none !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fxlvector.net%2Fblog%2F%3Fp%3D451&from=http%3A%2F%2Fxlvector.net%2Fblog%2F%3Fp%3D830">
                        <font size="-1" color="#333333" style="line-height: 1.65em; font-size: 12px !important;">近期推荐系统方面文章总结(非具体技术文章)</font>
                    </a>
                </td>
            </tr>
            <tr>
                <td style="margin: 0 !important; padding: 0 !important; line-height: 20px !important;">
                    <img border="0" src="http://static.wumii.com/images/widget/widget_solidPoint.gif">
                    <a target="_blank" style="text-decoration: none !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fxlvector.net%2Fblog%2F%3Fp%3D821&from=http%3A%2F%2Fxlvector.net%2Fblog%2F%3Fp%3D830">
                        <font size="-1" color="#333333" style="line-height: 1.65em; font-size: 12px !important;">推荐系统实践样章部分公布</font>
                    </a>
                </td>
            </tr>
            <tr>
                <td style="margin: 0 !important; padding: 0 !important; line-height: 20px !important;">
                    <img border="0" src="http://static.wumii.com/images/widget/widget_solidPoint.gif">
                    <a target="_blank" style="text-decoration: none !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fxlvector.net%2Fblog%2F%3Fp%3D400&from=http%3A%2F%2Fxlvector.net%2Fblog%2F%3Fp%3D830">
                        <font size="-1" color="#333333" style="line-height: 1.65em; font-size: 12px !important;">Twitter的用户推荐系统</font>
                    </a>
                </td>
            </tr>
            <tr>
                <td style="margin: 0 !important; padding: 0 !important; line-height: 20px !important;">
                    <img border="0" src="http://static.wumii.com/images/widget/widget_solidPoint.gif">
                    <a target="_blank" style="text-decoration: none !important;" href="http://app.wumii.com/ext/redirect.htm?url=http%3A%2F%2Fxlvector.net%2Fblog%2F%3Fp%3D682&from=http%3A%2F%2Fxlvector.net%2Fblog%2F%3Fp%3D830">
                        <font size="-1" color="#333333" style="line-height: 1.65em; font-size: 12px !important;">各个领域著名的推荐系统</font>
                    </a>
                </td>
            </tr>
    
    <tr>
        <td  align="right">
            <a style="text-decoration: none !important;" href="http://www.wumii.com/widget/relatedItems.htm" target="_blank" title="无觅相关文章插件">
                <font size="-1" color="#bbbbbb" style="display: block !important; font-family: arial !important; padding: 5px 0 !important; font-size: 12px !important; color: #bbb !important;">无觅</font>
            </a>
        </td>
    </tr>
</table>]]></description>
			<content:encoded><![CDATA[<p>最近一直在写这本书，所以blog也没有更新。经过一个春节的奋战，这本书终于到了收尾阶段。目前主要在完善已有的内容，调整一些章节次序。所以写篇blog暂时总结一下写作过程。</p>
<p>写本书远远比想象的要困难，而且比写博士论文困难多了。因为博士论文你可以假设没多少人会看，而且看的人水平和自己相似。此外，博士论文也就100多页。目前这本书大概写了200页，内容倒是挺全面，方方面面，边边角角都涉及到了。不过都是蜻蜓点水，仅供入门之用。而且显得有点杂。</p>
<p>其实推荐系统很难成为一个独立的学科，因为他用到的知识是其他领域都会用到的，只是用在了推荐这个问题上而已。下面，以FAQ的形式总结一下书的体系和内容。</p>
<p>问题：本书的定位是？<br />
回答：本书注重广度，忽略深度。考虑到这是国内第一本同类型的书，所以还是以入门为主。其实更重要的原因是考虑到自身的限制，个人对理论问题不是特别了解，所以怕写错了贻笑大方。而且，深入的问题可以从书中找到相关论文进行深入研究，所以本书主要起介绍作用。</p>
<p>问题：书里面有代码吗？<br />
回答：有，虽然我号称书里面的代码都是Python，不过经Python高手反应，说写的不像Python。不过，书里面的代码主要是作为一种沟通语言存在，起到解释公式的作用。所以书中的代码比较注重可读性，效率什么的考虑不多。不过，书中有大量的实验，这些实验是通过另外一套注重效率的代码实现的，当然这些代码的可读性就差一点。所以，希望大家把书中的代码当成伪码，其实很多书里面印的代码我都没有实际运行过。在书上的代码之外，我会提供一个真实的可以运行的代码，估计会放在诸如google code这种地方。</p>
<p>问题：Top-N推荐和评分预测问题，更侧重于哪个？<br />
回答：侧重于Top-N推荐，整个书用了8章讲Top-N推荐，1章讲评分预测。这似乎和这两个问题在学术界的论文数量是颠倒的，不过个人认为TopN推荐更符合实际，而且从Hulu近两年的实践来看，几乎没有用到评分预测的技术。但是，评分预测问题作为Netflix Prize的著名问题，不说又不好。所以在最后一章介绍了这个问题的著名算法。</p>
<p>问题：本书是不是充满了公式？<br />
回答：我没办法做到霍金那样一本书只有一个E=mc^2的公式。所以这本书主要采用三种语言和读者交流。第一种当然是中文，希望通过例子说清楚一个算法。第二种是公式，对于搞研究的朋友这个可能很直接。第三种是代码，主要是为软件工程师考虑，他们容易看懂。</p>
<p>之前公开过一个粗略的目录，这里给出一个目前详细的目录，大家应该能够看出大概。不过这个目录还要大改，不过现在可以凑合看看。</p>
<p>第一章	什么是好的推荐系统	5<br />
第1节	什么是推荐系统	6<br />
第2节	个性化推荐系统的应用	8<br />
2.1	电子商务	9<br />
2.2	电影和视频网站	12<br />
2.3	个性化音乐电台	14<br />
2.4	社会网络	17<br />
2.5	个性化阅读	19<br />
2.6	基于位置的服务	20<br />
2.7	个性化邮件	21<br />
2.8	个性化广告	22<br />
第3节	推荐系统的评测	23<br />
3.1	推荐系统实验方法	25<br />
3.2	评测指标	26<br />
第二章	如何解决推荐系统冷启动问题	37<br />
第1节	冷启动问题简介	37<br />
第2节	利用用户注册信息	38<br />
第3节	选择合适的物品启动用户的兴趣	44<br />
第4节	利用物品的内容信息	48<br />
第5节	发挥专家的作用	51<br />
第6节	总结	53<br />
第三章	如何利用用户行为数据	54<br />
第1节	用户行为数据简介	55<br />
1.1	用户行为的分类	55<br />
1.2	用户行为分析	57<br />
1.3	实验设计和算法评测	60<br />
1.4	基于用户的协同过滤算法	63<br />
1.5	基于物品的协同过滤算法	68<br />
1.6	UserCF和ItemCF的综合比较	76<br />
1.7	融合UserCF和ItemCF算法	80<br />
第2节	隐语义模型	81<br />
2.1	基于LFM的实际系统的例子	87<br />
2.2	LFM和基于邻域的方法的比较	87<br />
第3节	基于图的模型	88<br />
3.1	用户行为数据的二分图表示	88<br />
3.2	基于图的推荐算法	89<br />
第四章	如何利用物品的内容信息	92<br />
第1节	简介	92<br />
第2节	文本关键词的抽取	93<br />
第3节	基于物品的内容过滤算法	93<br />
第五章	如何利用用户标签数据	94<br />
第1节	UGC标签系统的代表应用	95<br />
1.1	Delicious	95<br />
1.2	CiteULike	96<br />
1.3	Lastfm	96<br />
1.4	豆瓣	97<br />
1.5	Hulu	97<br />
第2节	标签系统中的推荐问题	98<br />
2.1	用户为什么要标注	98<br />
2.2	用户如何打标签	99<br />
2.3	用户打什么样的标签	100<br />
第3节	基于标签的推荐系统	101<br />
3.1	实验设置	102<br />
3.2	一个最简单的算法	105<br />
3.3	算法的改进	106<br />
3.4	基于图的推荐算法	109<br />
3.5	基于标签的推荐解释	114<br />
第4节	给用户推荐标签	115<br />
4.1	为什么要给用户推荐标签	115<br />
4.2	如何给用户推荐标签	115<br />
4.3	实验设置	117<br />
4.4	基于图的标签推荐算法	118<br />
第5节	总结	119<br />
第六章	如何利用上下文信息	120<br />
第1节	上下文推荐系统简介	120<br />
第2节	时间上下文信息	121<br />
2.1	时间效应简介	121<br />
2.2	时间效应举例	122<br />
2.3	系统时间特性的分析	123<br />
2.4	推荐算法的时间多样性	125<br />
2.5	时间上下文推荐算法	127<br />
2.6	时间段图模型	131<br />
2.7	离线实验	133<br />
第3节	地点上下文信息	138<br />
3.1	地点上下文简介	138<br />
3.2	基于位置的推荐算法	139<br />
第七章	如何利用社会网络数据	142<br />
第1节	简介	143<br />
第2节	获取社会网络数据的途径	144<br />
2.1	电子邮件	144<br />
2.2	用户登录信息	145<br />
2.3	用户的位置数据	145<br />
2.4	论坛和讨论组	145<br />
2.5	即时聊天工具	145<br />
2.6	社交网站	146<br />
第3节	社会网络数据简介	147<br />
第4节	基于社会网络的推荐	148<br />
4.1	简介	148<br />
4.2	基于邻域的社会化推荐算法	150<br />
4.3	基于图的社会化推荐算法	150<br />
4.4	实际系统中的社会化推荐算法	152<br />
4.5	社会化推荐系统和协同过滤推荐系统	153<br />
第5节	给用户推荐好友	154<br />
5.1	基于内容的匹配	156<br />
5.2	基于共同兴趣的好友推荐	156<br />
5.3	基于社会网络图的好友推荐	156<br />
第八章	实际推荐系统的例子	159<br />
第1节	外围架构	159<br />
1.1	数据收集和存储	160<br />
第2节	在线推荐系统架构图	161<br />
2.1	推荐系统由多个推荐引擎构成	162<br />
2.2	推荐引擎的架构	163<br />
第3节	离线推荐模块设计	167<br />
3.1	特征物品相关表	167<br />
第九章	评分预测问题	168<br />
第1节	问题简介	168<br />
第2节	离线实验方法	168<br />
第3节	评分预测问题算法	169<br />
3.1	平均值	169<br />
3.2	基于邻域的方法	170<br />
3.3	隐语义模型/矩阵分解模型	171<br />
3.4	加入时间信息	177<br />
3.5	模型融合	177<br />
第十章	总结	179</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=ePz4X46xEGc:aB1VcyNDL8c:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=ePz4X46xEGc:aB1VcyNDL8c:-BTjWOF_DHI"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=ePz4X46xEGc:aB1VcyNDL8c:-BTjWOF_DHI" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=ePz4X46xEGc:aB1VcyNDL8c:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=ePz4X46xEGc:aB1VcyNDL8c:D7DqB2pKExk" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=ePz4X46xEGc:aB1VcyNDL8c:7Q72WNTAKBA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=7Q72WNTAKBA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=ePz4X46xEGc:aB1VcyNDL8c:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=ePz4X46xEGc:aB1VcyNDL8c:V_sGLiPBpWU" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=ePz4X46xEGc:aB1VcyNDL8c:qj6IDK7rITs"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=qj6IDK7rITs" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=ePz4X46xEGc:aB1VcyNDL8c:gIN9vFwOqvQ"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=ePz4X46xEGc:aB1VcyNDL8c:gIN9vFwOqvQ" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/blogspot/SHpi/~4/ePz4X46xEGc" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://xlvector.net/blog/?feed=rss2&amp;p=830</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://xlvector.net/blog/?p=830</feedburner:origLink></item>
		<item>
		<title>《推荐系统实践》关于Latent Factor Model</title>
		<link>http://feedproxy.google.com/~r/blogspot/SHpi/~3/tXrnPLQr0Uo/</link>
		<comments>http://xlvector.net/blog/?p=824#comments</comments>
		<pubDate>Thu, 22 Dec 2011 04:55:34 +0000</pubDate>
		<dc:creator>xlvector</dc:creator>
				<category><![CDATA[未分类]]></category>

		<guid isPermaLink="false">http://xlvector.net/blog/?p=824</guid>
		<description><![CDATA[Latent Factor Model，很多人称为SVD，其实是比较伪的SVD，一直是最近几年推荐系统研究的热点。但LFM的研究一直是在评分预测问题上的，很少有人用它去生成TopN推荐的列表，而且也很少有人研究如何将这个数据用到非评分数据上。 本来这本书不准备在实践部分讲这个算法，而只准备在后面介绍学术界研究热点的时候讲这个算法。但后来发现，如果不讲，显得实践部分都是些加减乘除的小把戏，没啥技术含量啊。于是我还是将如何在非评分数据上做LFM放到了实践的部分，当然这方面的相关论文还非常少。不过我觉得LFM在实践部分还是有其前景的。 具体怎么做，先卖个关子不忙说。先公布一个实验结果吧。我们知道，LFM有一个副产品是对物品自动聚类，我今天写书的时候在MovieLens数据集上试了一把，发现效果不错，先公布出来。]]></description>
			<content:encoded><![CDATA[<p>Latent Factor Model，很多人称为SVD，其实是比较伪的SVD，一直是最近几年推荐系统研究的热点。但LFM的研究一直是在评分预测问题上的，很少有人用它去生成TopN推荐的列表，而且也很少有人研究如何将这个数据用到非评分数据上。</p>
<p>本来这本书不准备在实践部分讲这个算法，而只准备在后面介绍学术界研究热点的时候讲这个算法。但后来发现，如果不讲，显得实践部分都是些加减乘除的小把戏，没啥技术含量啊。于是我还是将如何在非评分数据上做LFM放到了实践的部分，当然这方面的相关论文还非常少。不过我觉得LFM在实践部分还是有其前景的。</p>
<p>具体怎么做，先卖个关子不忙说。先公布一个实验结果吧。我们知道，LFM有一个副产品是对物品自动聚类，我今天写书的时候在MovieLens数据集上试了一把，发现效果不错，先公布出来。</p>
<p><img src="http://www.ituring.com.cn/download/01K9z4GVRy7a" /></p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=tXrnPLQr0Uo:SZ81U_mj6OE:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=tXrnPLQr0Uo:SZ81U_mj6OE:-BTjWOF_DHI"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=tXrnPLQr0Uo:SZ81U_mj6OE:-BTjWOF_DHI" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=tXrnPLQr0Uo:SZ81U_mj6OE:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=tXrnPLQr0Uo:SZ81U_mj6OE:D7DqB2pKExk" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=tXrnPLQr0Uo:SZ81U_mj6OE:7Q72WNTAKBA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=7Q72WNTAKBA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=tXrnPLQr0Uo:SZ81U_mj6OE:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=tXrnPLQr0Uo:SZ81U_mj6OE:V_sGLiPBpWU" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=tXrnPLQr0Uo:SZ81U_mj6OE:qj6IDK7rITs"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=qj6IDK7rITs" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=tXrnPLQr0Uo:SZ81U_mj6OE:gIN9vFwOqvQ"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=tXrnPLQr0Uo:SZ81U_mj6OE:gIN9vFwOqvQ" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/blogspot/SHpi/~4/tXrnPLQr0Uo" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://xlvector.net/blog/?feed=rss2&amp;p=824</wfw:commentRss>
		<slash:comments>1</slash:comments>
		<feedburner:origLink>http://xlvector.net/blog/?p=824</feedburner:origLink></item>
		<item>
		<title>推荐系统实践样章部分公布</title>
		<link>http://feedproxy.google.com/~r/blogspot/SHpi/~3/QBOppRvj1cc/</link>
		<comments>http://xlvector.net/blog/?p=821#comments</comments>
		<pubDate>Tue, 13 Dec 2011 23:54:49 +0000</pubDate>
		<dc:creator>xlvector</dc:creator>
				<category><![CDATA[book]]></category>

		<guid isPermaLink="false">http://xlvector.net/blog/?p=821</guid>
		<description><![CDATA[http://www.ituring.com.cn/article/725 样章很奇怪，是书的中间一章，关于标签推荐的。之所以选这章作为样章，是因为这一章内容相对独立，而且内容不是很多，比较容易写出来。这次样章分批公布，这次公布的是这一章的前言部分，也就是经常被和我一样的广大民工同志们称为废话的一部分。这一章的剩余部分会在未来的几周陆续公布。不过样章中有些实验结果尚未公布，只讨论了方法，结果要等正式出版时公布，留点悬念。 此外，我一开始写这本书的时候很多人都怕写的很晦涩难懂，不过这次我好像矫枉过正，写的过于简单了。同志们多批评。 目前我主要在集中写这一章的前一章，即如何利用隐反馈数据，主要包括neighborhood-based, latent factor model和 graph。]]></description>
			<content:encoded><![CDATA[<p><a href="http://www.ituring.com.cn/article/725">http://www.ituring.com.cn/article/725</a></p>
<p>样章很奇怪，是书的中间一章，关于标签推荐的。之所以选这章作为样章，是因为这一章内容相对独立，而且内容不是很多，比较容易写出来。这次样章分批公布，这次公布的是这一章的前言部分，也就是经常被和我一样的广大民工同志们称为废话的一部分。这一章的剩余部分会在未来的几周陆续公布。不过样章中有些实验结果尚未公布，只讨论了方法，结果要等正式出版时公布，留点悬念。</p>
<p>此外，我一开始写这本书的时候很多人都怕写的很晦涩难懂，不过这次我好像矫枉过正，写的过于简单了。同志们多批评。</p>
<p>目前我主要在集中写这一章的前一章，即如何利用隐反馈数据，主要包括neighborhood-based, latent factor model和 graph。</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=QBOppRvj1cc:P_uGIbrznJ4:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=QBOppRvj1cc:P_uGIbrznJ4:-BTjWOF_DHI"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=QBOppRvj1cc:P_uGIbrznJ4:-BTjWOF_DHI" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=QBOppRvj1cc:P_uGIbrznJ4:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=QBOppRvj1cc:P_uGIbrznJ4:D7DqB2pKExk" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=QBOppRvj1cc:P_uGIbrznJ4:7Q72WNTAKBA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=7Q72WNTAKBA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=QBOppRvj1cc:P_uGIbrznJ4:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=QBOppRvj1cc:P_uGIbrznJ4:V_sGLiPBpWU" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=QBOppRvj1cc:P_uGIbrznJ4:qj6IDK7rITs"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=qj6IDK7rITs" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=QBOppRvj1cc:P_uGIbrznJ4:gIN9vFwOqvQ"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=QBOppRvj1cc:P_uGIbrznJ4:gIN9vFwOqvQ" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/blogspot/SHpi/~4/QBOppRvj1cc" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://xlvector.net/blog/?feed=rss2&amp;p=821</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://xlvector.net/blog/?p=821</feedburner:origLink></item>
		<item>
		<title>个性化选择RSS源并生成个性化的Feed</title>
		<link>http://feedproxy.google.com/~r/blogspot/SHpi/~3/49EZ-gq9ax8/</link>
		<comments>http://xlvector.net/blog/?p=818#comments</comments>
		<pubDate>Fri, 09 Dec 2011 00:05:59 +0000</pubDate>
		<dc:creator>xlvector</dc:creator>
				<category><![CDATA[未分类]]></category>

		<guid isPermaLink="false">http://xlvector.net/blog/?p=818</guid>
		<description><![CDATA[自从GoogleReader改版之后，现在很难找到一个阅读列表里面大部分文章是自己喜欢的，每天都得点击自己比较喜欢的那些订阅的feed，一个个的看，相当的麻烦。而且，因为没有朋友的分享，很难发现新的feed，只能在自己订阅的feed里找来找去。 于是，我把之前爬下来的google reader数据分析了一下，做了一个工具： http://www.reculike.com/reader.php 打开这个工具，首先看到的是google reader里最热门的feed，你可以选择你喜欢的进行订阅，当选择完一页后，可以点击刷新按钮，他会根据你之前的选择生成新的一屏的个性化的feeds推荐，你可以继续选，每次不满意，就刷新一下。 所有的用户行为都记录在cookie中，当你想换个兴趣重新找feed的时候，可以点击重置按钮清空之前的历史行为记录。选完feed后，可以点击生成rss按钮，他会生成一个rss，这个rss包含了所有你之前选的feed的文章的最新100条记录，你可以在google reader订阅这个feed。 不过因为服务器性能很土憋，不能保证你订阅的feed的文章实时更新，所以你也可以把找到的feed自己一个个加到阅读器里。 比如下面是一个我选择的技术feed的合并feed http://www.reculike.com/site/reader/myfeed.php?uid=21]]></description>
			<content:encoded><![CDATA[<p>自从GoogleReader改版之后，现在很难找到一个阅读列表里面大部分文章是自己喜欢的，每天都得点击自己比较喜欢的那些订阅的feed，一个个的看，相当的麻烦。而且，因为没有朋友的分享，很难发现新的feed，只能在自己订阅的feed里找来找去。</p>
<p>于是，我把之前爬下来的google reader数据分析了一下，做了一个工具： <a href="http://www.reculike.com/reader.php">http://www.reculike.com/reader.php</a></p>
<p>打开这个工具，首先看到的是google reader里最热门的feed，你可以选择你喜欢的进行订阅，当选择完一页后，可以点击刷新按钮，他会根据你之前的选择生成新的一屏的个性化的feeds推荐，你可以继续选，每次不满意，就刷新一下。</p>
<p>所有的用户行为都记录在cookie中，当你想换个兴趣重新找feed的时候，可以点击重置按钮清空之前的历史行为记录。选完feed后，可以点击生成rss按钮，他会生成一个rss，这个rss包含了所有你之前选的feed的文章的最新100条记录，你可以在google reader订阅这个feed。</p>
<p>不过因为服务器性能很土憋，不能保证你订阅的feed的文章实时更新，所以你也可以把找到的feed自己一个个加到阅读器里。</p>
<p>比如下面是一个我选择的技术feed的合并feed<br />
<a href="http://www.reculike.com/site/reader/myfeed.php?uid=21">http://www.reculike.com/site/reader/myfeed.php?uid=21</a></p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=49EZ-gq9ax8:vZxZt3WJZJs:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=49EZ-gq9ax8:vZxZt3WJZJs:-BTjWOF_DHI"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=49EZ-gq9ax8:vZxZt3WJZJs:-BTjWOF_DHI" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=49EZ-gq9ax8:vZxZt3WJZJs:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=49EZ-gq9ax8:vZxZt3WJZJs:D7DqB2pKExk" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=49EZ-gq9ax8:vZxZt3WJZJs:7Q72WNTAKBA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=7Q72WNTAKBA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=49EZ-gq9ax8:vZxZt3WJZJs:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=49EZ-gq9ax8:vZxZt3WJZJs:V_sGLiPBpWU" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=49EZ-gq9ax8:vZxZt3WJZJs:qj6IDK7rITs"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=qj6IDK7rITs" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=49EZ-gq9ax8:vZxZt3WJZJs:gIN9vFwOqvQ"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=49EZ-gq9ax8:vZxZt3WJZJs:gIN9vFwOqvQ" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/blogspot/SHpi/~4/49EZ-gq9ax8" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://xlvector.net/blog/?feed=rss2&amp;p=818</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://xlvector.net/blog/?p=818</feedburner:origLink></item>
		<item>
		<title>Talk at MLA11 : Our solution of KDDCup 2011</title>
		<link>http://feedproxy.google.com/~r/blogspot/SHpi/~3/3LsO98kuTJ4/</link>
		<comments>http://xlvector.net/blog/?p=814#comments</comments>
		<pubDate>Mon, 07 Nov 2011 04:11:25 +0000</pubDate>
		<dc:creator>xlvector</dc:creator>
				<category><![CDATA[未分类]]></category>
		<category><![CDATA[algorithms]]></category>
		<category><![CDATA[kddcup]]></category>
		<category><![CDATA[mla11]]></category>
		<category><![CDATA[recommender system]]></category>

		<guid isPermaLink="false">http://xlvector.net/blog/?p=814</guid>
		<description><![CDATA[Kddcup2011 View more presentations from Liang Xiang]]></description>
			<content:encoded><![CDATA[<div style="width:425px" id="__ss_10052702"> <strong style="display:block;margin:12px 0 4px"><a href="http://www.slideshare.net/xlvector/kddcup2011" title="Kddcup2011" target="_blank">Kddcup2011</a></strong> <iframe src="http://www.slideshare.net/slideshow/embed_code/10052702" width="425" height="355" frameborder="0" marginwidth="0" marginheight="0" scrolling="no"></iframe>
<div style="padding:5px 0 12px"> View more <a href="http://www.slideshare.net/" target="_blank">presentations</a> from <a href="http://www.slideshare.net/xlvector" target="_blank">Liang Xiang</a> </div>
</div>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=3LsO98kuTJ4:uOuPXBxHC74:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=3LsO98kuTJ4:uOuPXBxHC74:-BTjWOF_DHI"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=3LsO98kuTJ4:uOuPXBxHC74:-BTjWOF_DHI" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=3LsO98kuTJ4:uOuPXBxHC74:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=3LsO98kuTJ4:uOuPXBxHC74:D7DqB2pKExk" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=3LsO98kuTJ4:uOuPXBxHC74:7Q72WNTAKBA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=7Q72WNTAKBA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=3LsO98kuTJ4:uOuPXBxHC74:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=3LsO98kuTJ4:uOuPXBxHC74:V_sGLiPBpWU" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=3LsO98kuTJ4:uOuPXBxHC74:qj6IDK7rITs"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=qj6IDK7rITs" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=3LsO98kuTJ4:uOuPXBxHC74:gIN9vFwOqvQ"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=3LsO98kuTJ4:uOuPXBxHC74:gIN9vFwOqvQ" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/blogspot/SHpi/~4/3LsO98kuTJ4" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://xlvector.net/blog/?feed=rss2&amp;p=814</wfw:commentRss>
		<slash:comments>3</slash:comments>
		<feedburner:origLink>http://xlvector.net/blog/?p=814</feedburner:origLink></item>
		<item>
		<title>Hulu’s Recommendation System</title>
		<link>http://feedproxy.google.com/~r/blogspot/SHpi/~3/bekB-B5LLI8/</link>
		<comments>http://xlvector.net/blog/?p=808#comments</comments>
		<pubDate>Tue, 20 Sep 2011 11:36:53 +0000</pubDate>
		<dc:creator>xlvector</dc:creator>
				<category><![CDATA[未分类]]></category>

		<guid isPermaLink="false">http://xlvector.net/blog/?p=808</guid>
		<description><![CDATA[This article comes from Hulu tech blog http://tech.hulu.com/blog/2011/09/19/recommendation-system/ This article is written by zhenghua, lihang and me. Haha As the Internet gets more and more popular, information overload poses an important challenge for a lot of online services. With all of the information pouring out from the web, users can be overwhelmed and confused as to [...]]]></description>
			<content:encoded><![CDATA[<p>This article comes from Hulu tech blog <a href="http://tech.hulu.com/blog/2011/09/19/recommendation-system/">http://tech.hulu.com/blog/2011/09/19/recommendation-system/</a></p>
<p>This article is written by zhenghua, lihang and me. Haha</p>
<p>As the Internet gets more and more popular, information overload poses an important challenge for a lot of online services. With all of the information pouring out from the web, users can be overwhelmed and confused as to what, exactly, they should be paying attention to.</p>
<p>A recommendation system provides a solution when a lot of useful content becomes too much of a good thing. A recommendation engine can help users discover information of interest by analyzing historical behaviors. More and more online companies — including Netflix, Google, Facebook, and many others — are integrating a recommendation system into their services to help users discover and select information that may be of particular interest to them.</p>
<p>With literally tens of thousands of hours of premium video content, Hulu users are also prone to content overload. Given the wide variety of content available on the service at any one time, it may be difficult for Hulu users to discover new video that best matches their historic interests. So <strong>the first goal of Hulu’s recommendation system is to help users find content which will be of interest to them.</strong></p>
<p>In addition to users, <strong>Hulu’s recommendation system should also help content owners promote their video</strong>. Part of our mission is to deliver a service that users, advertisers, and content owners all unabashedly love. We have many different content partners, and we understand that these content partners want more Hulu users to watch their videos — especially when new videos are released. By using personal recommendation instead of more traditional recommendation systems, we can promote video content more effectively since we will promote directly to users who are likely to enjoy the content we are recommending.</p>
<h4>Data Characteristics</h4>
<p>Before explaining the design of our recommendation system, we wanted to explain some parameters within our data.</p>
<p>Since a lot of our content is comprised of episodes or clips within a show, we have decided to recommend shows to users instead of individual videos. Shows are a good method of organization, and videos in the same show are usually very closely related.</p>
<p>Our content can be mainly divided into two parts: on-air shows and library shows. On-air shows are highly important since more than half of our streaming comes from them.</p>
<p>Although on-air shows occupy a large part of our content, they are touched by a seasonal effect. During summer months, most on-air shows do not air, causing on-air show streaming to decrease. Furthermore, there are fewer shows aired during weekends, thus the streaming of library shows will increase. Keeping this information in mind we can design the recommendation system to recommend more library shows to users during the weekend or summer months, as an example.</p>
<p>The key data that drives most recommendation systems is user behavior data. There are two main types of user behavior data: implicit user feedback data and explicit user feedback data. Explicit user feedback data primarily includes user voting data. Implicit feedback data includes information on users watching, browsing, searching, etc. Explicit feedback data can show a user’s preference on a show explicitly, but implicit feedback data cannot. For example, if a user gives a 5-star rating to a show, we know that this user likes the show very much. But if a user only watches a video from a show page or searches for a show, we don’t know whether this user likes the show.</p>
<p>As the quantity of implicit data at Hulu far outweighs the amount of explicit feedback, our system should be designed primarily to work with implicit feedback data.</p>
<h4>Architecture</h4>
<p>There are many different types of recommendation algorithms, and perhaps the most famous algorithm is collaborative filtering (CF). CF relies on user behavior data, and its main idea is to predict user preferences by analyzing their behaviors. There are two types of CF methods: user-based CF (UserCF) and item-based CF (ItemCF). UserCF assumes that a user will prefer items which are liked by other users who have similar preferences to that user. ItemCF assumes that a user will prefer items similar to the assets he or she preferred previously. ItemCF is widely used by many others (for example, Amazon and Netflix), as it has two main advantages. Firstly, it is suitable for sites where there are a lot more users than items. Secondly, ItemCF can easily explain recommendations given users’ historical behaviors. For example, if you have watched “Family Guy” on Hulu, we will recommend “American Dad” to you and tell you that we recommend this because you have watched “Family Guy”. So we use ItemCF as our basic recommendation algorithm in Hulu.</p>
<h4>On-line Architecture</h4>
<p>Figure 1 shows our on-line architecture of the recommendation system. This system contains 5 main modules:</p>
<ol>
<li><strong>User profile builder:</strong> When a user first comes into the recommendation system, we will first build a profile for them. The profile includes the user’s historical behaviors and topics, and these are generated from their old behaviors. Users can have many different types of behaviors. For example, they can watch videos, add shows to favorites, search for videos and vote on videos and shows. All these behaviors are considered by our system and, after extracting all these behaviors, we use a topic model which is trained offline to generate users’ preference on topics.</li>
<li><strong>Recommendation Core:</strong> After generating the list of the user’s historical preferences on shows and topics, we find shows similar to them and put all of those similar shows into raw recommendations.</li>
<li><strong>Filtering:</strong> For some pretty obvious reasons, raw recommendation results cannot be presented to users directly. We need to filter out shows the user has already seen or engaged with, so we can make the recommended shows a little more precise.</li>
<li><strong>Ranking:</strong> The ranking module will re-rank raw recommendations to make them better fit users’ preferences. First, we’ll make recommendations more diverse. Then we’ll increase novelty of recommendations so that users will find shows they like, but have never seen before.</li>
<li><strong>Explanation:</strong> Explanation is one of the most important components of every recommendation system. The explanation module generates some reasoning for every recommendation result using the user’s historical behaviors. For example, we will recommend “American Dad” to a user who had previously watched “Family Guy.” The explanation will say, “We recommend ‘American Dad’ to you because you have watched ‘Family Guy’”.
<div id="attachment_125"><a href="http://tech.hulu.com/blog/wp-content/uploads/2011/09/graphic-1.jpg"><img src="http://tech.hulu.com/blog/wp-content/uploads/2011/09/graphic-1.jpg" alt="" width="527" height="337" /></a><p>Figure 1 : On-line architecture of Hulu’s recommendation system&nbsp;</p>
</div>
<h4>Off-line Architecture</h4>
<p>In the above on-line architecture, some components rely on offline resources, such as the topic model, related model, feedback model, etc. The off-line system is also an important part of our recommendation system. Our off-line system has these main components:</p>
<ol>
<li><strong>Data Center:</strong> The data center contains all user behavior data in Hulu. Some of them are stored in Hadoop clusters and some of them are stored in a relational database.</li>
<li><strong>Related Table Generator:</strong> The related table is an important resource for on-line recommendation. We use two main types of related table: one that’s based on collaborative filtering (which we’ll call CF), and another based on content. In CF, show A and show B will have high similarity if users who like show A also like show B. With content filtering, we use content information including title, description, channel, company, actor/actress, and tags.</li>
<li><strong>Topic Model:</strong> A topic is represented by a group of shows that have similar content. Topics are thus larger in scope than shows, but they’re still smaller than channels. Our topics are learned by LDA, which is a popular topic model in machine learning.</li>
<li><strong>Feedback Analyzer:</strong> Feedback specifically means users’ reactions to recommendation results. Using user feedback can improve recommendation quality. For example, say a show is recommended to many users, but most of them do not click this show. In that case, we’ll decrease the rank of this show. Users will also have different types of behavior, so we’ll use all these behaviors in developing the recommendations. However, some users may prefer recommendations to come from their prior watch history, and some users may prefer their recommendations to come from their voting behavior. All these effects can be modeled offline by analyzing users’ feedback on their recommendations.</li>
<li><strong>Report Generator:</strong> Evaluation is one of the most important parts of the recommendation system. The report generator will generate a report including multiple metrics every day to show the quality of recommendations. At Hulu we monitor metrics including CTR, conversion ratio, etc.</li>
</ol>
<p>&nbsp;</p>
<div id="attachment_128"><a href="http://tech.hulu.com/blog/wp-content/uploads/2011/09/graphic-2.jpg"><img src="http://tech.hulu.com/blog/wp-content/uploads/2011/09/graphic-2.jpg" alt="" width="585" height="325" /></a><p>Figure 2 : Off-line architecture of Hulu’s recommendation system&nbsp;</p>
</div>
<p>&nbsp;</p>
<h4>Algorithms</h4>
<p>So far, we’ve given a brief overview of our recommendation architecture. From the previous discussion, we can see that Hulu’s recommendation system is primarily based on ItemCF. We’ve added many improvements on top of the ItemCF algorithm, too, in order to make it generate better recommendations. To test these improvements, we’ve performed many A/B tests on different algorithms. In the following sections, we’ll introduce some of these algorithms and the experiment results.</p>
<h4>Item-based Collaborative Filtering</h4>
<p>Item-based Collaborative Filtering (ItemCF) is the basis of all our algorithms. In ItemCF, let N(u) be a set of items user u has preferred previously. User u’s preference on item i (i is not in N(u)) can then be measured by:</p>
<p>p(u,i) = \sum_{j \in N(u)} r(u,j) s(i,j)</p>
<p>Here, r(u,j) is the preference weight of user u on show j, and s(i,j) is the similarity between show i and show j. In CF, the similarity between two shows is calculated by user behavior data on these two shows. Let N(i) be a set of users who watched show i and N(j) be a set of users who watched show j. Then, the similarity s(i,j) between show i and show j is calculated by the following formula:</p>
<p><img title="s(i,j)=\frac{\left | N(i)\cap N(j) \right |}{\sqrt{\left | N(i) \right | \left | N(j) \right |}}" src="http://tech.hulu.com/blog/wp-content/latex/ae2/ae2e6d3b05921c32d650c484d6438942-ffffff-000000-2.png" alt="s(i,j)=\frac{\left | N(i)\cap N(j) \right |}{\sqrt{\left | N(i) \right | \left | N(j) \right |}}" />In this definition, show i will be highly relevant to show j if most users who watch show i will also watch show j. However, this definition will have the “Harry Potter problem,” which means that every show will have high relevance with popular shows.</p>
<h4>Recent Behavior</h4>
<p>The first lesson we learned from A/B testing is that recommendations should fit users’ recent preference and that users’ recent behavior is more important than their older, historical behaviors. So, in our engine, we will put more weight on users’ recent behaviors. In our system, CTR of recommendations that originate from users’ recent watch behavior is 1.8 times higher than CTR of recommendations originating from users’ old watch behavior.</p>
<p>&nbsp;</p>
<p><a href="http://tech.hulu.com/blog/wp-content/uploads/2011/09/graphic-5.jpg"><img title="graphic-5" src="http://tech.hulu.com/blog/wp-content/uploads/2011/09/graphic-5.jpg" alt="" width="344" height="290" /></a></p>
<p>&nbsp;</p>
<h4>Novelty</h4>
<p>Just because a recommendation system can accurately predict user behavior does not mean it produces a show that you want to recommend to an active user. For example, “Family Guy” is a very popular show on Hulu, and thus most users have watched at least some episodes from this show. These  users do not need us to recommend this show to them — the show is popular enough that users will decide whether or not to watch it by themselves.</p>
<p>Thus, novelty is also an important metric to evaluate recommendations. The first way we think we can increase novelty is by revising the ItemCF algorithm:</p>
<ol>
<li>First, we will decrease weight of popular shows that users have watched before.</li>
<li>Then, we’ll put more weight on shows that are not only similar to shows the active user watched before, but also less popular than shows the active user watched before.</li>
</ol>
<h4>Explanation-based Diversity</h4>
<p>Most users have diverse preferences, so the recommendation should also meet their diverse interests. In our system, we use explanations to diversify our recommendations. We think a diverse recommendation means most of the recommended shows have different explanations.</p>
<p>We have performed an A/B test to show the usefulness of diversification (shown in the above figure). The results of the experiment show that, for active users who had previously watched 10 or more shows, diversification can increase recommendation CTR significantly.</p>
<h4>Temporal Diversity</h4>
<p>A good recommendation system should not generate static recommendations. Users want to see new suggestions every time they visit the recommendation system. If a user has new behaviors, she will find her recommendations have changed because we have put more weight on the user’s recent behaviors. But if a user has no new behaviors, we also need to change our recommendations. We use three methods to keep temporal diversity of our system:</p>
<ol>
<li>First, we’ll recommend recently-added shows to users. Many new shows are added to Hulu every day, and we will suggest these shows to users who will like them. Thus, users will see fresh ideas for shows to watch when new ones are added.</li>
<li>Second, we will randomize our recommendations. Randomization is the simplest way to keep recommendations fresh.</li>
<li>Finally, we’ll decrease the rank of recommendations which users have seen many times. This is called implicit feedback, and data show that CTR is increased by 10% after using this method.</li>
</ol>
<h4>Performance of Hulu’s Recommendation Hub</h4>
<p>The recommendation hub is a personal recommendation page for every user. On this page users will see 6 carousels. The top carousel is “top recommendations”, which includes shows that we think users will prefer very much. After top recommendations, there are three carousels for three genres. These three genres are selected by analyzing users’ historical preferences. The next carousel is bookmarks, which include shows that users have indicated they’d like to watch later. The last carousel is filled with shows that the user has already rated. This carousel is designed to collect more explicit feedback from users.</p>
<p><a href="http://tech.hulu.com/blog/wp-content/uploads/2011/09/graphic-6.jpg"><img title="graphic-6" src="http://tech.hulu.com/blog/wp-content/uploads/2011/09/graphic-6.jpg" alt="" width="513" height="344" /></a></p>
<p>We have performed an A/B test to compare our recommendation algorithms with two simple recommendation algorithms: Most Popular (which recommends the most popular shows to every user) and Highest Rated (which recommends highly-rated shows to every user). As shown in the above figure, experiment results show that the CTR of our algorithm is much higher than both simple methods.</p>
<h4>Lessons</h4>
<p><em>Every user behavior can reflect user preferences.</em></p>
<p>In our system, we use a slew of user behaviors to come up with our recommendations. We’ve calculated the CTR of recommendations originating from different types of behaviors. As shown in Figure 3, we can see that every type of behavior can generate recommendations that will be clicked by users.</p>
<div id="attachment_134"><a href="http://tech.hulu.com/blog/wp-content/uploads/2011/09/graphic-7.jpg"><img src="http://tech.hulu.com/blog/wp-content/uploads/2011/09/graphic-7.jpg" alt="" width="582" height="316" /></a><p>Figure 3 : CTR of recommendations that come from different types of behaviors&nbsp;</p>
</div>
<p><em>Explicit Feedback data is more important than implicit feedback data</em></p>
<p>As shown in Figure 3, CTR of recommendations that originate from users’ historically loved (vote 5 stars on shows) and liked (vote 4 stars on shows) behaviors is higher than CTR of recommendations that come from users’ historical subscribe/watch/search behavior. So although the size of our explicit feedback data is much smaller than that of our implicit feedback data, it is much more important.</p>
<p><em>Recent behaviors are much more important than old behaviors</em></p>
<p><em>Novelty, Diversity, and offline Accuracy are all important factors</em></p>
<p>Most researchers focus on improving offline accuracy, such as RMSE, precision/recall. However, recommendation systems that can accurately predict user behavior alone may not be good enough for practical use. A good recommendation system should consider multiple factors together. In our system, after considering novelty and diversity, the CTR has improved by more than 10%.</p>
<blockquote><p><em>Based on the paper “Recommendation System at Hulu” by Liang Xiang, Hua Zheng and Hang Li.<br />
Hua Zheng is the senior lead developer in charge of the Hulu content recommendation and behavior targeting systems.<br />
Dr. Xiang and Dr. Li, associate researchers, are working together on the recommendation system, helping users discover and enjoy relevant premium videos.</em></p></blockquote>
</li>
</ol>
<p>&nbsp;</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=bekB-B5LLI8:Gw8iMpLnpA8:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=bekB-B5LLI8:Gw8iMpLnpA8:-BTjWOF_DHI"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=bekB-B5LLI8:Gw8iMpLnpA8:-BTjWOF_DHI" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=bekB-B5LLI8:Gw8iMpLnpA8:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=bekB-B5LLI8:Gw8iMpLnpA8:D7DqB2pKExk" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=bekB-B5LLI8:Gw8iMpLnpA8:7Q72WNTAKBA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=7Q72WNTAKBA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=bekB-B5LLI8:Gw8iMpLnpA8:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=bekB-B5LLI8:Gw8iMpLnpA8:V_sGLiPBpWU" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=bekB-B5LLI8:Gw8iMpLnpA8:qj6IDK7rITs"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=qj6IDK7rITs" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=bekB-B5LLI8:Gw8iMpLnpA8:gIN9vFwOqvQ"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=bekB-B5LLI8:Gw8iMpLnpA8:gIN9vFwOqvQ" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/blogspot/SHpi/~4/bekB-B5LLI8" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://xlvector.net/blog/?feed=rss2&amp;p=808</wfw:commentRss>
		<slash:comments>5</slash:comments>
		<feedburner:origLink>http://xlvector.net/blog/?p=808</feedburner:origLink></item>
		<item>
		<title>推荐系统的有效性——Amazon到底是百分之多少</title>
		<link>http://feedproxy.google.com/~r/blogspot/SHpi/~3/3cK55wBBros/</link>
		<comments>http://xlvector.net/blog/?p=802#comments</comments>
		<pubDate>Tue, 20 Sep 2011 06:19:51 +0000</pubDate>
		<dc:creator>xlvector</dc:creator>
				<category><![CDATA[未分类]]></category>

		<guid isPermaLink="false">http://xlvector.net/blog/?p=802</guid>
		<description><![CDATA[Amazon作为推荐系统的老大(King of recommender system)，关于推荐系统对amazon究竟起了多大的作用，一直广受学术界和工业界的关注，而各方面的数字也很多。我发挥了考据学的精神，把这些数字都考据出来，当然我也不知道什么是真的。 长尾理论一书的作者虽然没有对推荐系统的作用做出估计，但估计了长尾内容对Amazon销售额的贡献比例。Anderson对这个数字做过两次估计。第一次是和MIT的研究团队一起估计出57%的销售来自长尾。这个数字的基础是Amazon曾经公布过在2001年到2003年排名前100000的商品占了39.2%的销售额。但在后来的更精确的估计中他提出这个比例应该在25%到36%之间。 Amazon的前科学家Greg Linden在他们的blog中也讨论了推荐系统在Amazon中的作用，他提到在他离开时，推荐系统对Amazon的贡献额在20%左右： Personalization was responsible for well more than 20% of sales when I left Amazon in 2002. 此外，Amazon的一位科学家曾经在斯坦福讲推荐系统的课，一位听了他的课的同学在自己的blog里提到20% &#8211; 30% 的销售额来自于推荐系统。 从上面的考据可以基本判定，推荐系统对Amazon的销售额的贡献在20%到30%之间。 更新 ： 9月21号Greg Linden又发表了一篇blog提到了这个问题，里面给出的数字是35%。然后他引用的是这篇文章。这篇文章中有下面一段话： Amazon says 35 percent of product sales result from recommendations. 不过文章并没有给出这句话的来源。而评论里也有群众提到了这个问题： Nice write-up, Matt. From where did you get “Amazon says 35 percent [...]]]></description>
			<content:encoded><![CDATA[<p>Amazon作为推荐系统的老大(<a href="http://www.readwriteweb.com/archives/recommender_systems.php">King of recommender system</a>)，关于推荐系统对amazon究竟起了多大的作用，一直广受学术界和工业界的关注，而各方面的数字也很多。我发挥了考据学的精神，把这些数字都考据出来，当然我也不知道什么是真的。</p>
<p>长尾理论一书的作者虽然没有对推荐系统的作用做出估计，但估计了长尾内容对Amazon销售额的贡献比例。Anderson对这个数字做过两次估计。第一次是和MIT的研究团队一起估计出<a href="http://longtail.typepad.com/the_long_tail/2005/08/a_methodology_f.html">57%</a>的销售来自长尾。这个数字的基础是Amazon曾经公布过在2001年到2003年排名前100000的商品占了39.2%的销售额。但在后来的更精确的估计中他提出这个比例应该在<a href="http://www.longtail.com/the_long_tail/2005/08/the_8020_rule_r.html">25%到36%</a>之间。</p>
<p>Amazon的前科学家Greg Linden在他们的<a href="http://glinden.blogspot.com/2006/12/35-of-sales-from-recommendations.html">blog</a>中也讨论了推荐系统在Amazon中的作用，他提到在他离开时，推荐系统对Amazon的贡献额在20%左右：</p>
<p><strong><em>Personalization was responsible for well more than 20% of sales when I left Amazon in 2002.</em></strong></p>
<p>此外，Amazon的一位科学家曾经在斯坦福讲推荐系统的课，一位听了他的课的同学在自己的<a href="http://blog.kiwitobes.com/?p=58">blog</a>里提到20% &#8211; 30% 的销售额来自于推荐系统。</p>
<p>从上面的考据可以基本判定，推荐系统对Amazon的销售额的贡献在20%到30%之间。</p>
<p>更新 ：</p>
<p>9月21号Greg Linden又发表了一篇<a href="http://glinden.blogspot.com/2011/09/quick-links.html">blog</a>提到了这个问题，里面给出的数字是35%。然后他引用的是<a href="http://venturebeat.com/2006/12/10/aggregate-knowledge-raises-5m-from-kleiner-on-a-roll/">这篇文章</a>。这篇文章中有下面一段话：</p>
<p>Amazon says 35 percent of product sales result from recommendations.</p>
<p>不过文章并没有给出这句话的来源。而评论里也有群众提到了这个问题：</p>
<p>Nice write-up, Matt. From where did you get “Amazon says 35 percent of product sales result from recommendations”? </p>
<p>所以，20%到30%应该还是差不多靠谱的数字。但这个数字还是很高了。</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=3cK55wBBros:T1KnWlv2Aco:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=3cK55wBBros:T1KnWlv2Aco:-BTjWOF_DHI"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=3cK55wBBros:T1KnWlv2Aco:-BTjWOF_DHI" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=3cK55wBBros:T1KnWlv2Aco:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=3cK55wBBros:T1KnWlv2Aco:D7DqB2pKExk" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=3cK55wBBros:T1KnWlv2Aco:7Q72WNTAKBA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=7Q72WNTAKBA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=3cK55wBBros:T1KnWlv2Aco:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=3cK55wBBros:T1KnWlv2Aco:V_sGLiPBpWU" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=3cK55wBBros:T1KnWlv2Aco:qj6IDK7rITs"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=qj6IDK7rITs" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=3cK55wBBros:T1KnWlv2Aco:gIN9vFwOqvQ"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=3cK55wBBros:T1KnWlv2Aco:gIN9vFwOqvQ" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/blogspot/SHpi/~4/3cK55wBBros" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://xlvector.net/blog/?feed=rss2&amp;p=802</wfw:commentRss>
		<slash:comments>2</slash:comments>
		<feedburner:origLink>http://xlvector.net/blog/?p=802</feedburner:origLink></item>
		<item>
		<title>推荐系统有效性—— Digg 40%的提升</title>
		<link>http://feedproxy.google.com/~r/blogspot/SHpi/~3/u5I-Kly9k5Q/</link>
		<comments>http://xlvector.net/blog/?p=799#comments</comments>
		<pubDate>Sat, 10 Sep 2011 13:14:07 +0000</pubDate>
		<dc:creator>xlvector</dc:creator>
				<category><![CDATA[未分类]]></category>
		<category><![CDATA[effective]]></category>
		<category><![CDATA[recommender system]]></category>

		<guid isPermaLink="false">http://xlvector.net/blog/?p=799</guid>
		<description><![CDATA[最近准备写一个系列，是关于各个公司用了推荐系统后的关于推荐系统究竟有没有给他们带来利益的报告。所以称为推荐系统有效性系列。 今天的例子是Digg。Digg在2008年有一篇官方的blog，可以从下面的地址看到 http://about.digg.com/blog/digg-recommendation-engine-updates Blog的主要意思就是说，Digg在那个时候发布了他们的新的推荐算法，然后他们经过了1个月的测试，测试到了一系列数据。他们觉得他们的算法NB了，于是贴出来炫耀一下。下面是一些他们提高的指标 1. Digg行为的活跃度获得了明显的提高，新算法发布后，每天的用户Digg总数提高了40% 2.推荐系统的影响越来越大，平均每个有digg行为的用户每天会获得200个推荐结果，这些结果来自和他们有相似兴趣的其他digger。由此可以看到Digg的推荐算法是类似于基于用户的协同过滤算法。统计结果显示，平均每个活跃用户会有34个和他们兴趣相似的digger。 3. 用户好友数增加了24%。 4. 用户的评论数增加了11%。 看完上面的数据，大家肯定热血沸腾了，但大家肯定困惑，Digg是通过什么算法取得这个效果的。不用担心，关于Digg的推荐系统，在2008年还有一篇文章 http://www.technologyreview.com/Infotech/21045/page1/ 这篇文章详细讨论了背后的算法。 会英文的同学可以很容易的读懂上面的文章，但我在这里还是想总结一下。 1. Digg的算法和Amazon不同，他不是给用户推荐和他们的历史行为相关的物品，而是更加依赖于集体智能，给用户推荐和他们兴趣相似的用户喜欢的文章。换句学术的话，Digg用的是UserCF算法，而不是ItemCF算法 2. Digg一开始是让用户提交他们喜欢的文章的链接，如果别的用户喜欢这个文章，可以顶一下，不喜欢可以踩一下。而原先digg的首页展示了被顶的最多的热门文章。因此，在Digg的系统中，热门度是文章很重要的属性。UserCF的算法可以在提供个性化的同时保证热门度，而很多基于Item的算法不能保证这一点，这也是Digg选择UserCF的一个原因。 3. 系统在利用UserCF的同时，也考虑到了主题(Topic)的影响。Digg认为，如果两个人digg了很多同样的体育文章，不代表他们的政治观点也是一致的。因此，他们会计算不同topic中的用户的兴趣相似度。也就是说用户在体育领域兴趣相似，不会扩展到政治领域。当然，digg的topic都是很大的topic，所以并不会太多的影响推荐结果的多样性。 4. 我们知道，UserCF在计算用户的兴趣相似度时，两个用户有相似的兴趣是因为他们共同看过同一篇文章，但如果这样的话，一个用户看过一篇热门的文章，就会和很多用户产生相似度。Digg注意到了这个问题，并尽量去除了这种影响。]]></description>
			<content:encoded><![CDATA[<p>最近准备写一个系列，是关于各个公司用了推荐系统后的关于推荐系统究竟有没有给他们带来利益的报告。所以称为推荐系统有效性系列。</p>
<p>今天的例子是Digg。Digg在2008年有一篇官方的blog，可以从下面的地址看到 <a href="http://about.digg.com/blog/digg-recommendation-engine-updates">http://about.digg.com/blog/digg-recommendation-engine-updates</a></p>
<p>Blog的主要意思就是说，Digg在那个时候发布了他们的新的推荐算法，然后他们经过了1个月的测试，测试到了一系列数据。他们觉得他们的算法NB了，于是贴出来炫耀一下。下面是一些他们提高的指标</p>
<p>1. Digg行为的活跃度获得了明显的提高，新算法发布后，每天的用户Digg总数提高了40%</p>
<p>2.推荐系统的影响越来越大，平均每个有digg行为的用户每天会获得200个推荐结果，这些结果来自和他们有相似兴趣的其他digger。由此可以看到Digg的推荐算法是类似于基于用户的协同过滤算法。统计结果显示，平均每个活跃用户会有34个和他们兴趣相似的digger。</p>
<p>3. 用户好友数增加了24%。</p>
<p>4. 用户的评论数增加了11%。</p>
<p>看完上面的数据，大家肯定热血沸腾了，但大家肯定困惑，Digg是通过什么算法取得这个效果的。不用担心，关于Digg的推荐系统，在2008年还有一篇文章 <a href="http://www.technologyreview.com/Infotech/21045/page1/">http://www.technologyreview.com/Infotech/21045/page1/</a> 这篇文章详细讨论了背后的算法。 会英文的同学可以很容易的读懂上面的文章，但我在这里还是想总结一下。</p>
<p>1. Digg的算法和Amazon不同，他不是给用户推荐和他们的历史行为相关的物品，而是更加依赖于集体智能，给用户推荐和他们兴趣相似的用户喜欢的文章。换句学术的话，Digg用的是UserCF算法，而不是ItemCF算法</p>
<p>2. Digg一开始是让用户提交他们喜欢的文章的链接，如果别的用户喜欢这个文章，可以顶一下，不喜欢可以踩一下。而原先digg的首页展示了被顶的最多的热门文章。因此，在Digg的系统中，热门度是文章很重要的属性。UserCF的算法可以在提供个性化的同时保证热门度，而很多基于Item的算法不能保证这一点，这也是Digg选择UserCF的一个原因。</p>
<p>3. 系统在利用UserCF的同时，也考虑到了主题(Topic)的影响。Digg认为，如果两个人digg了很多同样的体育文章，不代表他们的政治观点也是一致的。因此，他们会计算不同topic中的用户的兴趣相似度。也就是说用户在体育领域兴趣相似，不会扩展到政治领域。当然，digg的topic都是很大的topic，所以并不会太多的影响推荐结果的多样性。</p>
<p>4. 我们知道，UserCF在计算用户的兴趣相似度时，两个用户有相似的兴趣是因为他们共同看过同一篇文章，但如果这样的话，一个用户看过一篇热门的文章，就会和很多用户产生相似度。Digg注意到了这个问题，并尽量去除了这种影响。</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=u5I-Kly9k5Q:LqdV6YOODCU:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=u5I-Kly9k5Q:LqdV6YOODCU:-BTjWOF_DHI"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=u5I-Kly9k5Q:LqdV6YOODCU:-BTjWOF_DHI" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=u5I-Kly9k5Q:LqdV6YOODCU:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=u5I-Kly9k5Q:LqdV6YOODCU:D7DqB2pKExk" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=u5I-Kly9k5Q:LqdV6YOODCU:7Q72WNTAKBA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=7Q72WNTAKBA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=u5I-Kly9k5Q:LqdV6YOODCU:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=u5I-Kly9k5Q:LqdV6YOODCU:V_sGLiPBpWU" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=u5I-Kly9k5Q:LqdV6YOODCU:qj6IDK7rITs"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=qj6IDK7rITs" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=u5I-Kly9k5Q:LqdV6YOODCU:gIN9vFwOqvQ"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=u5I-Kly9k5Q:LqdV6YOODCU:gIN9vFwOqvQ" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/blogspot/SHpi/~4/u5I-Kly9k5Q" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://xlvector.net/blog/?feed=rss2&amp;p=799</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://xlvector.net/blog/?p=799</feedburner:origLink></item>
		<item>
		<title>reculike的几点改动</title>
		<link>http://feedproxy.google.com/~r/blogspot/SHpi/~3/H48w7JdrDsQ/</link>
		<comments>http://xlvector.net/blog/?p=796#comments</comments>
		<pubDate>Mon, 29 Aug 2011 14:55:28 +0000</pubDate>
		<dc:creator>xlvector</dc:creator>
				<category><![CDATA[未分类]]></category>
		<category><![CDATA[reculike]]></category>

		<guid isPermaLink="false">http://xlvector.net/blog/?p=796</guid>
		<description><![CDATA[reculike.com 上线后感谢大家的支持，有了一定的访问量。不过目前的访问量还不支持能够算出好的推荐结果，所以希望大家能多反馈。 最近对reculike做了一些改动，总结如下 1. 用户的主要显性反馈为两种。每篇paper下面可以让用户bookmark，表示用户对这篇paper感兴趣，准备记录下来，以后有时间仔细研究。另外，在paper的页面，用户可以recommend一篇paper，表示用户觉得自己对这篇文章很熟悉，觉得很好，希望推荐给别人。目前，用户如果要recommend文章，就一定要写推荐语。 这两种行为代表了一种专家和普通用户的互动。今后在这方面还有一些后续的功能。比如，一个普通用户可能bookmark一篇文章，表示他对文章有兴趣，那么这个时候，如果有专家recommend这篇文章，系统就会在首页上告诉这个用户有专家recommend这篇文章了，那么如果这个用户对这篇文章有疑问，可以向这位专家请教。因此，可以通过paper来联系用户，实现用户的互动。 2. 在首页显示了用户的bookmark过的paper，用户recommend过的paper，和系统给用户的推荐paper。默认显示推荐的paper，但用户可以通过点击上面的链接来切换不同的paper列表。 目前系统还很粗糙，欢迎大家使用。有什么问题可以在sina微博上 @xlvector]]></description>
			<content:encoded><![CDATA[<p><a href="http://www.reculike.com">reculike.com</a> 上线后感谢大家的支持，有了一定的访问量。不过目前的访问量还不支持能够算出好的推荐结果，所以希望大家能多反馈。</p>
<p>最近对reculike做了一些改动，总结如下</p>
<p>1. 用户的主要显性反馈为两种。每篇paper下面可以让用户bookmark，表示用户对这篇paper感兴趣，准备记录下来，以后有时间仔细研究。另外，在paper的页面，用户可以recommend一篇paper，表示用户觉得自己对这篇文章很熟悉，觉得很好，希望推荐给别人。目前，用户如果要recommend文章，就一定要写推荐语。</p>
<p>这两种行为代表了一种专家和普通用户的互动。今后在这方面还有一些后续的功能。比如，一个普通用户可能bookmark一篇文章，表示他对文章有兴趣，那么这个时候，如果有专家recommend这篇文章，系统就会在首页上告诉这个用户有专家recommend这篇文章了，那么如果这个用户对这篇文章有疑问，可以向这位专家请教。因此，可以通过paper来联系用户，实现用户的互动。</p>
<p>2. 在首页显示了用户的bookmark过的paper，用户recommend过的paper，和系统给用户的推荐paper。默认显示推荐的paper，但用户可以通过点击上面的链接来切换不同的paper列表。</p>
<p>目前系统还很粗糙，欢迎大家使用。有什么问题可以在sina微博上 @xlvector</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=H48w7JdrDsQ:0kkdoTZ8seM:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=H48w7JdrDsQ:0kkdoTZ8seM:-BTjWOF_DHI"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=H48w7JdrDsQ:0kkdoTZ8seM:-BTjWOF_DHI" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=H48w7JdrDsQ:0kkdoTZ8seM:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=H48w7JdrDsQ:0kkdoTZ8seM:D7DqB2pKExk" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=H48w7JdrDsQ:0kkdoTZ8seM:7Q72WNTAKBA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=7Q72WNTAKBA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=H48w7JdrDsQ:0kkdoTZ8seM:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=H48w7JdrDsQ:0kkdoTZ8seM:V_sGLiPBpWU" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=H48w7JdrDsQ:0kkdoTZ8seM:qj6IDK7rITs"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=qj6IDK7rITs" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=H48w7JdrDsQ:0kkdoTZ8seM:gIN9vFwOqvQ"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=H48w7JdrDsQ:0kkdoTZ8seM:gIN9vFwOqvQ" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/blogspot/SHpi/~4/H48w7JdrDsQ" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://xlvector.net/blog/?feed=rss2&amp;p=796</wfw:commentRss>
		<slash:comments>0</slash:comments>
		<feedburner:origLink>http://xlvector.net/blog/?p=796</feedburner:origLink></item>
		<item>
		<title>RecULike 论文推荐系统初步上线</title>
		<link>http://feedproxy.google.com/~r/blogspot/SHpi/~3/xAXOUVLnKfY/</link>
		<comments>http://xlvector.net/blog/?p=777#comments</comments>
		<pubDate>Sun, 21 Aug 2011 05:14:29 +0000</pubDate>
		<dc:creator>xlvector</dc:creator>
				<category><![CDATA[未分类]]></category>

		<guid isPermaLink="false">http://xlvector.net/blog/?p=777</guid>
		<description><![CDATA[我们开发的论文推荐系统RecULike (http://www.reculike.com) 已经初步上线，不过目前还有很多bug，但基本能用，还在不断的改善中。 该系统是一个开源项目，他的源代码可以从下面获取 http://code.google.com/p/paperlens/ 该项目的主要贡献者是 WangXing 和 GuoJing]]></description>
			<content:encoded><![CDATA[<p>我们开发的论文推荐系统RecULike (<a href="http://www.reculike.com">http://www.reculike.com</a>) 已经初步上线，不过目前还有很多bug，但基本能用，还在不断的改善中。</p>
<p>该系统是一个开源项目，他的源代码可以从下面获取</p>
<p><a href="http://code.google.com/p/paperlens/">http://code.google.com/p/paperlens/</a></p>
<p>该项目的主要贡献者是</p>
<p>WangXing 和 GuoJing</p>
<div class="feedflare">
<a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=xAXOUVLnKfY:i9s31VcOGA4:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=xAXOUVLnKfY:i9s31VcOGA4:-BTjWOF_DHI"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=xAXOUVLnKfY:i9s31VcOGA4:-BTjWOF_DHI" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=xAXOUVLnKfY:i9s31VcOGA4:D7DqB2pKExk"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=xAXOUVLnKfY:i9s31VcOGA4:D7DqB2pKExk" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=xAXOUVLnKfY:i9s31VcOGA4:7Q72WNTAKBA"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=7Q72WNTAKBA" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=xAXOUVLnKfY:i9s31VcOGA4:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=xAXOUVLnKfY:i9s31VcOGA4:V_sGLiPBpWU" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=xAXOUVLnKfY:i9s31VcOGA4:qj6IDK7rITs"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?d=qj6IDK7rITs" border="0"></img></a> <a href="http://feeds.feedburner.com/~ff/blogspot/SHpi?a=xAXOUVLnKfY:i9s31VcOGA4:gIN9vFwOqvQ"><img src="http://feeds.feedburner.com/~ff/blogspot/SHpi?i=xAXOUVLnKfY:i9s31VcOGA4:gIN9vFwOqvQ" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/blogspot/SHpi/~4/xAXOUVLnKfY" height="1" width="1"/>]]></content:encoded>
			<wfw:commentRss>http://xlvector.net/blog/?feed=rss2&amp;p=777</wfw:commentRss>
		<slash:comments>7</slash:comments>
		<feedburner:origLink>http://xlvector.net/blog/?p=777</feedburner:origLink></item>
	</channel>
</rss>

