#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
talkindex.py v2.19 by [[zh:user:Shizhao]]
"""
#
# (C) Shizhao, 2008
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id: talkindex.py,v 2.16 2008-03-28 Shizhao $'
#
import os
import sys
import urllib
import re, time, datetime
import wikipedia, config, cosmetic_changes, catlib
import xml.parsers.expat
site = wikipedia.getSite()  # the wiki configured in user-config (zh.wikipedia)
# Pre-fetched talk-page category listings, used by FGflag() below to mark
# entries as featured articles (FA), good articles (GA) or featured lists.
# Fetched once at import time; the bot loops forever, so these go stale
# between runs -- NOTE(review): presumably acceptable for this bot, confirm.
FAcat=catlib.Category(site, u'Category:特色条目讨论')
FAcats=FAcat.articlesList()
GAcat=catlib.Category(site, u'Category:优良条目讨论')
GAcats=GAcat.articlesList()
FALcat=catlib.Category(site, u'Category:特色列表讨论')
FALcats=FALcat.articlesList()
def GetPage(rcstart, rcend, rcns):
    """Fetch the recent-changes XML from the zh.wikipedia API.

    rcstart / rcend bound the time window (newest first), rcns selects
    the namespace.  Returns the raw response body.
    """
    query = ('http://zh.wikipedia.org/w/api.php?action=query'
             '&list=recentchanges&rcstart=%s&rcend=%s&rcnamespace=%s'
             '&rcprop=title|user|timestamp|comment&rclimit=500&format=xml'
             % (rcstart, rcend, rcns))
    wikipedia.output(u'URL:\n%s' % query)
    opener = wikipedia.MyURLopener()
    response = opener.open(query)
    return response.read()
tempList = []  # rendered {{talkindex}} lines for the namespace being processed
titleList=[]   # talk-page titles already indexed (duplicate suppression)
talkList=[]    # section names already indexed (duplicate suppression)
tlist=[]       # one 'title#section' key per observed edit (filled by listelement)
tulist=[]      # one 'title#section'+user key per observed edit
# Extracts the section name from a MediaWiki edit summary like "/* Foo */ ...".
talkformat='''/* (.*?) \*/'''
n=0            # number of index entries written for the current namespace
# One wiki list line per discussion; filled in by start().
template='''* {{talkindex|title=%s|user=%s|talk=%s|time=%s|indiscussion=%s|hot=%s|fg=%s|nomain=%s|disambig=%s|redirect=%s|empty=%s|protect=%s|nowcommons=%s|talkprotect=%s}}\n'''
# First parsing pass: collect the edit keys for every talk-page change.
def listelement(name, attrs):
    """Expat start-element handler (first pass over the XML).

    For every <rc> element whose edit summary names a talk section,
    append 'title#section' to the module-global tlist and
    'title#section'+user to tulist.  Elements missing an expected
    attribute or without a "/* section */" summary are skipped.
    """
    if name != 'rc':
        return
    try:
        page = attrs[u'title']
        editor = attrs[u'user']
        summary = attrs[u'comment']
        attrs[u'timestamp']  # presence check only; value unused here
    except KeyError:
        return
    # Pull the section name out of the edit summary.
    match = re.search(talkformat, summary)
    if match is None:
        return
    key = page + '#' + match.groups()[0]
    tlist.append(key)
    tulist.append(key + editor)
def remove_dups(lst):
    """Return a list of the unique elements of *lst*.

    Drawbacks:
      - The result order is unspecified.
      - Elements must be hashable, so lists, dicts etc. as list
        elements are not supported.
    """
    # dict keys are unique by construction.  Wrapping in list() makes the
    # documented "returns a list" contract hold on Python 3 as well, where
    # .keys() would otherwise be a view object rather than a real list.
    return list(dict.fromkeys(lst))
# Count participating users and edits for one discussion topic.
def usersum(t, talks=None, talk_users=None):
    """Count distinct users and total edits for discussion key *t*.

    t          -- 'title#section' key identifying the topic.
    talks      -- list with one 'title#section' key per observed edit
                  (defaults to the module-global tlist).
    talk_users -- parallel list of 'title#section'+user keys
                  (defaults to the module-global tulist).

    Returns (distinct users, total edits).  A topic with only a single
    edit yields (0, 0): lone edits do not count as a discussion.

    The original implementation rebuilt a loop-invariant filtered list
    (itself calling O(n) list.count per element) on every iteration of
    the zip loop -- roughly O(n^3).  The filter is hoisted here: the
    old per-iteration test was equivalent to "this entry's key equals
    t AND t occurs more than once overall".
    """
    if talks is None:
        talks = tlist
    if talk_users is None:
        talk_users = tulist
    if talks.count(t) > 1:
        tu1list = [tu for tt, tu in zip(talks, talk_users) if tt == t]
    else:
        tu1list = []
    nuser = len(set(tu1list))  # distinct 'key+user' strings == distinct users
    ntalk = len(tu1list)
    return nuser, ntalk
# Flag hot discussions.
def bighot(t, title):
    """Return 'yes' when topic *t* is a hot discussion, else 'no'.

    A topic is hot once more than one user and more than two edits
    have touched it; hot topics are also logged to the console.
    """
    nusers, nedits = usersum(t)
    if nusers > 1 and nedits > 2:
        wikipedia.output('"Hot!" In [[%s]] have %s users (%s edits) Talking......!' % (title, nusers, nedits))
        return 'yes'
    return 'no'
# Add the {{indiscussion}} template to very active talk-page sections.
def IndiscussionAuto(t, title, talk, days, i):
    """Tag an active talk-page section with {{indiscussion}}.

    Inserts the template directly under the "== talk ==" heading on
    page *title* when the topic has more than one participant and more
    than five edits, and is not already tagged.

    t     -- 'title#section' discussion key (used in the edit summary)
    title -- talk-page title to edit
    talk  -- section heading text, taken from an edit summary
    days  -- recent-changes window size, for the edit summary
    i     -- existing-tag marker from Indiscussion(); None means untagged
    """
    nuser, ntalk = usersum(t)
    wikipedia.output(u'%s users / %s edits, already tagged: %s' % (nuser, ntalk, i))
    if nuser > 1 and ntalk > 5 and i is None:
        pg = wikipedia.Page(site, title)
        text = pg.get()
        # BUG FIX: re.sub()'s 4th positional argument is *count*, not
        # *flags* -- the old call passed re.I (== 2) there, so matching
        # was never case-insensitive and at most two replacements were
        # made.  The flag now lives in the pattern itself via (?i).
        # The heading is also re.escape()d, since it comes from user
        # input and may contain regex metacharacters.
        pattern = '(?i)' + '==' + ' ' + re.escape(talk) + ' ' + '==' + '\n'
        replacement = '==' + ' ' + talk + ' ' + '==' + '\n{{indiscussion|' + str(time.gmtime()[1]) + '}}' + '\n'
        text = re.sub(pattern, replacement, text)
        pg.put(text, u'Bot添加 {{indiscussion}} 模板,最近%s天有%s位用户正在讨论“[[%s|%s]]”话题,已经编辑%s次' % (days, nuser, t, talk, ntalk))
        wikipedia.output(u'flag {{indiscussion}}: %s' % talk)
# Refresh the {{CurrentDiscussion}} counter template.
def CurrentDiscussion():
    """Count the pages transcluding {{indiscussion}} and write that
    number into [[Template:CurrentDiscussion]]."""
    name = 'indiscussion'
    source = wikipedia.Page(site, 'Template:%s' % name)
    count = len(list(source.getReferences(onlyTemplateInclusion=True)))
    target = wikipedia.Page(site, 'Template:CurrentDiscussion')
    body = u"<font color=red>%s</font>项<noinclude>\n----\n参见[[:Category:進行中的討論|進行中的討論]]\n[[category:維基站務模板|C]]\n</noinclude>" % count
    summary = u"Bot更新: 当前有 %s 项专题讨论" % count
    wikipedia.output('[[Template:CurrentDiscussion]] update: Current %s Discussion' % count)
    target.put(body, summary, minorEdit=False)
# Detect an existing {{indiscussion}} tag on a section.
def Indiscussion(t, title, talk, days):
    """Check whether section *talk* on page *title* already carries an
    {{indiscussion}} template directly under its heading.

    Returns 'yes' when the tag is found, None when it is absent or the
    page cannot be read (missing page, redirect, fetch error).
    """
    page = wikipedia.Page(site, title)
    try:
        text = page.get()
        # re.escape: the heading comes from an edit summary and may
        # contain regex metacharacters.
        found = re.search('=* *' + re.escape(talk) + ' *=*' + ' *\n*' + r'\{\{indiscussion(|)(.*?)\}\}', text, re.I)
        if found is None:
            return None
        wikipedia.output(u'"{{Indiscussion}}" found in [[%s]]' % title)
        return 'yes'
    # except Exception, not a bare except, so Ctrl-C / SystemExit still
    # propagate; any page-fetch failure simply means "not tagged".
    except Exception:
        return None
# Mark featured / good articles (and featured lists).
def FGflag(title, FAcats, GAcats, FALcats):
    """Classify talk page *title* as featured or good content.

    Returns 'FA' when the page sits in the featured-article or
    featured-list talk category, 'GA' for the good-article category,
    and '' otherwise.  Membership is tested against the pre-fetched
    category page lists because wikipedia.getCategoryLinks() was
    returning empty lists at the time this was written.
    """
    # Build the Page object once instead of once per comparison.
    page = wikipedia.Page(site, title)
    if page in FAcats or page in FALcats:
        return 'FA'
    if page in GAcats:
        return 'GA'
    return ""
def nontalk(title):
    """Inspect the subject page that talk page *title* belongs to.

    Returns a dict of status flags (each 'yes', '' or 'none'):
      nomain      -- 'none' when the subject page does not exist
      disambig    -- subject page is a disambiguation page
      redirect    -- subject page is a redirect
      empty       -- subject page is empty (possible vandalism)
      protect     -- subject page cannot be edited
      nowcommons  -- (images only) the file already exists on Commons
      talkprotect -- the talk page itself cannot be edited
    """
    pg = wikipedia.Page(site, title)
    subject = pg.toggleTalkPage()
    # BUG FIX: the original dict literal listed u'nomain' twice; it is
    # listed once here.  All flags default to '' so the branches below
    # only need to set the positive cases.
    status = {u'nomain': '', u'disambig': '', u'redirect': '', u'empty': '',
              u'protect': '', u'nowcommons': '', u'talkprotect': ''}
    if subject.exists():
        if subject.isDisambig():
            status[u'disambig'] = 'yes'
            wikipedia.output(u'[[%s]] is Disambig page!' % subject.title())
        if subject.isRedirectPage():
            status[u'redirect'] = 'yes'
            wikipedia.output(u'[[%s]] is Redirect Page!' % subject.title())
        if subject.isEmpty() and not subject.isRedirectPage():
            status[u'empty'] = 'yes'
            wikipedia.output(u'WARING: [[%s]] is Empty!!!' % subject.title())
        if not subject.canBeEdited():
            status[u'protect'] = 'yes'
            wikipedia.output(u'[[%s]] is protected!!!' % subject.title())
    else:
        status[u'nomain'] = 'none'
        wikipedia.output(u'[[%s]] is Not exist!!!' % subject.title())
    if subject.isImage():
        imagepage = wikipedia.ImagePage(site, subject.title())
        if imagepage.fileIsOnCommons():
            status[u'nowcommons'] = 'yes'
        # Dead code removed: the original also fetched the template list
        # and computed a 'FeaturedPicture' flag here (using the Python-2
        # only '<>' operator), but the result was never used or returned.
    if not pg.canBeEdited():
        status[u'talkprotect'] = 'yes'
        wikipedia.output(u'[[%s]] is Protected!!!' % title)
    return status
def start(name, attrs):
    """Expat start-element handler (second pass): build index entries.

    For every <rc> element whose edit summary names a talk section,
    render one {{talkindex}} line -- with hot / indiscussion / FA-GA /
    page-status flags -- into the module-global tempList and increment
    the counter n.  Reads and mutates the module globals titleList,
    talkList, tempList and n; also reads template, talkformat and days.
    """
    temp=''''''
    ns = ''
    title = ''''''
    user = ''''''
    timestamp=''
    talk=''''''
    global n
    if name == 'rc':
        try:
            title=attrs[u'''title''']
            user=attrs[u'''user''']
            talk=attrs[u'''comment''']
            timestamp=attrs[u'''timestamp''']
            ns=attrs[u'''ns''']
            try:
                # Skip duplicate entries: extract the "/* section */" name
                # from the edit summary (AttributeError when absent).
                talk=re.search(talkformat, talk).groups()[0]
                t=title+'#'+talk
                tu=t+user
                # Index only (title, section) combinations not seen before.
                if (titleList.count(title)==0 and talkList.count(talk)>0) or (titleList.count(title)>0 and talkList.count(talk)==0) or (titleList.count(title)==0 and talkList.count(talk)==0):
                    if wikipedia.Page(site,title).isRedirectPage():
                        wikipedia.output(u'%s is Redirect Page.' % title)
                    else:
                        titleList.append(title)
                        talkList.append(talk)
                        # print talkList, titleList
                        # Extension part: compute the flags for this entry ------------------------------------------------------
                        hot=bighot(t, title)
                        stuts=nontalk(title)
                        if ns=='1':
                            # FA/GA flags only apply to article talk pages.
                            fg=FGflag(title,FAcats, GAcats, FALcats)
                        else:
                            fg=''
                        i=Indiscussion(t,title,talk,days)
                        # Pages on the bot's blacklist are never auto-tagged.
                        badpage=wikipedia.Page(site,u'User:Talkindexbot/blist')
                        if wikipedia.Page(site,title) not in badpage.linkedPages():
                            IndiscussionAuto(t, title, talk, days, i)
                        else:
                            wikipedia.output(u'%s in black list' % title)
                        # print stuts
                        temp = template % (title, user, talk, timestamp, i, hot, fg, stuts[u'nomain'], stuts[u'disambig'], stuts[u'redirect'],stuts[u'empty'], stuts[u'protect'], stuts[u'nowcommons'], stuts[u'talkprotect'])
                        # print 'HI', temp
                        tempList.append(temp)
                        n=n+1
            except AttributeError:
                return
        except KeyError:
            return
    else:
        return
# Run one parsing pass over the fetched XML.
def Parsexml(html, start_element):
    """Parse *html* with expat, dispatching each start tag to
    *start_element*; malformed XML is silently ignored."""
    parser = xml.parsers.expat.ParserCreate()
    parser.StartElementHandler = start_element
    parser.returns_unicode = True  # Python 2 pyexpat option
    try:
        parser.Parse(html)
    except xml.parsers.expat.ExpatError:
        return
def run():
    """Main loop: rebuild every namespace's discussion-index page, then
    sleep four hours and start over.  Analyses the recent changes of the
    last `days` days and extracts all useful information."""
    while True:
        # Build the time window: rcstart is "now", rcend is `days` ago.
        rcstart = time.strftime("%Y%m%d%H%M%S",time.gmtime())
        y=time.strftime("%Y",time.gmtime())
        m=time.strftime("%m",time.gmtime())
        d=time.strftime("%d",time.gmtime())
        h=time.strftime("%H",time.gmtime())
        min=time.strftime("%M",time.gmtime())  # NOTE(review): shadows the builtin min()
        s=time.strftime("%S",time.gmtime())
        end=datetime.datetime(int(y),int(m),int(d),int(h),int(min),int(s))
        end = end - datetime.timedelta(days=days)
        rcend = end.strftime("%Y%m%d%H%M%S")
        # One target index page per talk namespace number.
        ns={'1':u'Wikipedia:对话页讨论索引/条目','5':u'Wikipedia:对话页讨论索引/wikipedia','7':u'Wikipedia:对话页讨论索引/图像','9':u'Wikipedia:对话页讨论索引/mediawiki','11':u'Wikipedia:对话页讨论索引/模板','13':u'Wikipedia:对话页讨论索引/帮助','15':u'Wikipedia:对话页讨论索引/分类','101':u'Wikipedia:对话页讨论索引/主题'}
        for rcns, wiki in ns.items():
            html = GetPage(rcstart,rcend, rcns)
            # Two passes: listelement collects edit keys, start renders entries.
            Parsexml(html, listelement)
            Parsexml(html, start)
            global n, tempList
            # Pick the page header matching the namespace.
            if rcns== '1':
                basewiki=u'''<noinclude>\n{{talkindex panel}}\n本页是中文维基百科条目对话页上活跃讨论的索引。由[[user:Talkindexbot|]]定期更新。\n</noinclude>\n\n== 最近%s天在条目对话页上的讨论 ==\n''' % days
            elif rcns == '5':
                basewiki=u'''<noinclude>\n{{talkindex panel}}\n本页是中文维基百科Wikipedia(项目、方针等页面)对话页上活跃讨论的索引。由[[user:Talkindexbot|]]定期更新。\n</noinclude>\n\n== 最近%s天在Wikipedia对话页上的讨论 ==\n''' % days
            elif rcns == '7':
                basewiki=u'''<noinclude>\n{{talkindex panel}}\n本页是中文维基百科图像对话页上活跃讨论的索引。由[[user:Talkindexbot|]]定期更新。\n</noinclude>\n\n== 最近%s天在图像对话页上的讨论 ==\n''' % days
            elif rcns == '9':
                basewiki=u'''<noinclude>\n{{talkindex panel}}\n本页是中文维基百科Mediawiki(系统界面)对话页上活跃讨论的索引。由[[user:Talkindexbot|]]定期更新。\n</noinclude>\n\n== 最近%s天在Mediawiki对话页上的讨论 ==\n''' % days
            elif rcns == '11':
                basewiki=u'''<noinclude>\n{{talkindex panel}}\n本页是中文维基百科模板对话页上活跃讨论的索引。由[[user:Talkindexbot|]]定期更新。\n</noinclude>\n\n== 最近%s天在模板对话页上的讨论 ==\n''' % days
            elif rcns == '13':
                basewiki=u'''<noinclude>\n{{talkindex panel}}\n本页是中文维基百科帮助对话页上活跃讨论的索引。由[[user:Talkindexbot|]]定期更新。\n</noinclude>\n\n== 最近%s天在帮助对话页上的讨论 ==\n''' % days
            elif rcns == '15':
                basewiki=u'''<noinclude>\n{{talkindex panel}}\n本页是中文维基百科分类对话页上活跃讨论的索引。由[[user:Talkindexbot|]]定期更新。\n</noinclude>\n\n== 最近%s天在分类对话页上的讨论 ==\n''' % days
            elif rcns == '101':
                basewiki=u'''<noinclude>\n{{talkindex panel}}\n本页是中文维基百科主题(Portal)对话页上活跃讨论的索引。由[[user:Talkindexbot|]]定期更新。\n</noinclude>\n\n== 最近%s天在主题对话页上的讨论 ==\n''' % days
            wikipedia.output(u'Namespace: %s: Total %s talk in %s days' % (rcns, n, days))
            Lists = "".join(tempList)
            tempList=[]
            if Lists == "":
                basewiki=basewiki+u'当前没有活跃的讨论。'
                comment=u"Bot更新讨论索引:最近%s天,Namespace %s 上没有活跃的讨论" % (days, rcns)
            else:
                basewiki=basewiki+(u'当前共有%s项讨论。最后更新于~~~~~\n' % n)+ Lists
                comment=u"Bot更新讨论索引:最近%s天内,Namespace %s 上共有%s项讨论" % (days, rcns, n)
            pg=wikipedia.Page(site,wiki)
            pg.put(basewiki, comment, minorEdit=False)
            n=0
        CurrentDiscussion()
        hours=4
        now = time.strftime("%d %b %Y %H:%M:%S (UTC)", time.gmtime())
        wikipedia.output(u'\nDone.')
        wikipedia.stopme()
        print '\nSleeping %s hours, now %s' % (hours, now)
        time.sleep(hours *60 *60)
# Size of the recent-changes window: index discussions from the last X days.
days=7
# Entry point: loop forever; always release the bot framework on exit.
try:
    run()
finally:
    wikipedia.stopme()