Usuari:TronaBot/Python/absents.py
< Usuari:TronaBot | Python
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
# Copyleft (!C) 2014 Coet
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse, codecs as cs, os, random, re, sys, time
sys.path.append("/home/pasqual/public_html/pyuserlib/")
from func import format_bytes, sort_list, Chrono
import wikipedia as wikibot, query as api
def read_file(filename):
    """Return the contents of a log file, or "" if it does not exist.

    filename -- base name; the file actually read is ``logs/<filename>.log``.
    """
    path = "logs/%s.log" % filename
    if not os.path.exists(path):
        return ""
    # Context manager guarantees the handle is closed even if read() raises;
    # the original open/read/close sequence leaked the handle on error.
    with cs.open(path, "r", "utf8") as f:
        return f.read()
def get_page(title):
    """Return the page object for *title* on the English Wikipedia."""
    en_site = get_site("en")
    return wikibot.Page(en_site, title)
def API(params, site):
    """Thin wrapper forwarding a raw query dict to the MediaWiki API helper."""
    result = api.GetData(params, site=site)
    return result
def get_properties(article):
    """Return ``(language_codes, size_in_bytes)`` for *article*.

    Queries the MediaWiki API for the page's interlanguage links and the
    byte size of its latest revision.  Returns ``([], 0)`` both for missing
    pages (e.g. redirects) and for pages with no language links at all.
    """
    params = {
        "action": "query",
        "prop": ["langlinks", "revisions"],
        "lllimit": "max",
        "rvprop": "size",
        "rvlimit": 1,
        "titles": article.title(),
        "indexpageids": ""
    }
    q = API(params, article.site())
    pageid = q['query']['pageids'][0]
    # Hoist the repeated q['query']['pages'][pageid] lookup done four times
    # in the original.
    page = q['query']['pages'][pageid]
    # `in` replaces dict.has_key(), which is deprecated and removed in Py3.
    if "missing" in page or "langlinks" not in page:
        return [], 0
    langs = [iw['lang'] for iw in page['langlinks']]
    return langs, page['revisions'][0]['size']
def get_absents(limit=None, langs=None):
"""
Obtenim la llista d'articles que no tenim però que hi són a més de 20 altres viquis
limit: limita el nombre d'articles. Ha de ser un nombre enter.
langs: llista de llengües on cercar. Ha de ser una llista amb els codis corresponents.
"""
limit = limit and " LIMIT %i" % limit or ""
langs = langs or ["en", "es", "de", "it", "ja", "fr", "gl", "eu", "pt"]
f = "/home/pasqual/public_html/pywikilab/logs/absents_{lg}wiki.log"
for lang in langs:
f = os.path.join("logs", "absents_%swiki.log" % lang)
data = {"fi": f, "lg":lang, "li":limit, "iwmin": args.iwmin-1}
sql = "SELECT COUNT(ll_lang) AS ll_count, CONCAT('[[:{lg}:', page_title, ']]') AS page_wlink, page_len " \
"FROM langlinks LEFT JOIN page ON page_id = ll_from " \
"WHERE page_namespace = 0 " \
"GROUP BY ll_from " \
"HAVING MAX(ll_lang = 'ca') = 0 AND COUNT(ll_lang) > {iwmin} " \
"ORDER BY ll_count DESC, page_len DESC, page_title ASC{li};".format(**data)
data['sql']=sql
call = 'mysql -h {lg}wiki-p.db.toolserver.org -e "USE {lg}wiki_p; {sql}" > {fi}'.format(**data)
c.show()
print "calling {lg}wiki_p db".format(**data)
os.system(call)
def list_absents():
    u"""Yield the tab-separated fields (count, wikilink, size) of every
    line in the cleaned absents log."""
    cleaned = read_file("absents_cleaned")
    for entry in cleaned.splitlines():
        yield entry.split("\t")
def get_pages():
    u"""Build a dict mapping article title -> (translation count, size)
    from the cleaned absents log, skipping malformed rows."""
    result = {}
    for count, link, size in list_absents():
        if not count.isdigit():
            continue
        # Extract the bare title out of the [[:xx:Title]] wikilink.
        title = re.search("\[\[:[^:]+:([^\]]+)\]\]", link).group(1)
        result[title] = (int(count), size)
    return result
def get_published_articles():
    """Return the list of enwiki titles that were already published.

    Parses [[:en:Title]] wikilinks out of logs/absents_published.log;
    returns [] when the log is absent or empty.
    """
    # Read the file once; the original called read_file() twice for the
    # same content.
    content = read_file("absents_published")
    if not content:
        return []
    return re.findall("\[\[:en:([^\]]+?)\]\]", content)
def fix_disamb(skip=False):
    u"""Clean the raw SQL results, dropping disambiguation pages, pages
    already published, and SQL false positives.

    skip -- when True, also skip titles already listed as published.

    Writes the surviving (count, title, size) rows to
    logs/absents_cleaned.log.  The process takes about 4 hours for
    enwiki; ideally disambiguations would be filtered in the SQL query.
    """
    pages = []
    skip = get_published_articles() if skip else []
    s = 0; d = 0; f = 0
    for counter, page, size in list_absents():
        if not counter.isdigit():
            continue
        counter = int(counter)
        # BUG FIX: the original assigned convert_bytes(size) to an unused
        # local here, but convert_bytes is neither defined nor imported
        # anywhere in this file, so the loop crashed with NameError on its
        # first iteration.  The dead call has been removed.
        page = re.search("\[\[:[^:]+:([^\]]+)\]\]", page).group(1).replace("_"," ")
        en_page = get_page(page)
        if not en_page.exists():
            continue
        if en_page.isDisambig():
            wikibot.output(u"skip disamb %s" % page)
            d += 1
            continue
        if page in skip:
            wikibot.output(u"skip %s" % page)
            s += 1
            continue
        api_iws, api_size = get_properties(en_page)
        # BUG FIX: was `and`, which only skipped pages that BOTH had too few
        # interwikis AND already existed in ca.  random_lists() applies the
        # same filter as `not (len(iws) >= args.iwmin and "ca" not in iws)`,
        # i.e. an OR of the two conditions; made consistent here.
        if len(api_iws) < args.iwmin or "ca" in api_iws:
            wikibot.output(u"skip false %s" % page)
            f += 1
            continue
        # Progress heartbeat every 1000 kept pages.
        if len(pages) % 1000 == 0:
            wikibot.output(u"%i %s [d: %i, s: %i, f: %i]" % (len(pages), page, d, s, f))
            c.show()
        pages.append((counter, page, size))
    wf = cs.open("logs/absents_cleaned.log", "w", "utf8")
    i = 0
    for items in pages:
        i += 1
        wf.write(u"%i\t[[:en:%s]]\t%s\r\n" % items)
    wf.close()
    wikibot.output(u"[t: %i, d: %i, s: %i, f: %i]" % (i, d, s, f))
def save(page, new_content):
    u"""Archive the page's current text to its /arxiu subpage, then
    replace the page with *new_content*."""
    archive_title = u"%s/arxiu" % page.title()
    archive = wikibot.Page(page.site(), archive_title)
    archive.put(page.get(), "Bot: arxivant")
    page.put(new_content, "Bot: actualitzant", force=True)
def random_lists():
    u"""Build 10 lists of 100 articles each, picked at random from the
    main list, skipping articles already published; write each list to
    logs/absents_random-NNN.log, optionally publish it, and finally
    rewrite logs/absents_published.log with every title used so far.

    Uses module globals: args (iwmin / publish flags), root, c (chrono).
    """
    pages=get_pages()
    skip=get_published_articles()
    # Python 2 keys() returns a list, so popping while iterating is safe here.
    for page in pages.keys():
        # drop articles that were already published
        if page in skip:
            pages.pop(page)
            continue
        # drop articles with fewer interwikis than required
        if pages[page][0]<args.iwmin: pages.pop(page)
    i=0
    print "len pages.keys()", len(pages.keys())
    # Copy so appends below don't mutate the skip list.
    published = skip + []
    while i<10:
        i+=1
        txt = ""
        # draw up to 100 random articles, re-validating each via the live API
        stack={}
        while len(stack)<100:
            if len(pages)==0:
                break
            page = random.choice(pages.keys())
            values = pages.pop(page)
            en_page=get_page(page)
            if en_page.isDisambig(): continue
            iws, size = get_properties(en_page)
            # keep only pages with >= iwmin interwikis and no ca version
            if not (len(iws) >= args.iwmin and "ca" not in iws): continue
            stack[page]=values
        if len(stack)==0:
            break
        # Group drawn pages by interwiki count: iws -> [(size, title), ...]
        sorted_items = {}
        for page in stack:
            iws = stack[page][0]
            size = stack[page][1]
            if not sorted_items.has_key(iws):
                sorted_items[iws]=[]
            # NOTE(review): keyed with int(iws) but guarded with has_key(iws);
            # these only agree because get_pages() stores ints — confirm.
            sorted_items[int(iws)].append((int(size),page))
        # Largest articles first within each interwiki bucket.
        for n in sorted_items:
            sorted_items[n].sort()
            sorted_items[n].reverse()
        f = cs.open("logs/absents_random-%03i.log" % i, "w", "utf8")
        # Wikitable header (Catalan UI text — runtime output, do not translate).
        line = u"""{{Avís|Signeu amb només <code><nowiki>~~~ </nowiki>""" \
        u"""</code>, no cal la quarta. }}\n{| class="wikitable sortable"\n""" \
        u"""! # iws!! Article en anglès !! Mida !! Usuari !! Observacions !! Article en català\n"""
        txt+=line
        f.write(line)
        # Emit rows from most to fewest interwikis.
        for n in reversed(sorted(sorted_items.keys())):
            for item in sorted_items[n]:
                title = item[1]
                size = item[0]
                published.append(title)
                line = u"|-\n| %i || [[:en:%s]] || %s || || ||" % (
                    n,
                    title,
                    format_bytes(size)
                )
                #wikibot.output(line)
                txt+= u"%s\r\n" % line
                f.write(u"%s\r\n" % line)
        line = "|}"
        f.write(line)
        f.close()
        txt += line
        print "r%03i" % i,
        c.show()
        if args.publish in ("all", "random"):
            page = wikibot.Page(get_site("ca"), "%s/aleatoris/%03i" % (root, i))
            save(page, txt)
    # Persist the full published list (old + newly drawn titles), sorted.
    f = cs.open("logs/absents_published.log", "w", "utf8")
    f.write(u"[[:en:%s]]" % (u"]]\r\n[[:en:".join(sort_list(published))))
    f.close()
def main_list():
    u"""Write the main list of all absent articles, grouped by interwiki
    count, to logs/absents_main.log; optionally publish it to the wiki.

    Uses module globals: args (publish flag), root (target page prefix).
    """
    pages = get_pages()
    # Group titles by interwiki count: iws -> [(size, title), ...]
    pages_by_iwnumber = {}
    for page in pages:
        iws = pages[page][0]
        # `in` replaces the deprecated dict.has_key().
        if iws not in pages_by_iwnumber:
            pages_by_iwnumber[iws] = []
        pages_by_iwnumber[iws].append((int(pages[page][1]), page))
    # Largest articles first within each bucket.
    for counter in pages_by_iwnumber.keys():
        pages_by_iwnumber[counter].sort()
        pages_by_iwnumber[counter].reverse()
    f = cs.open("logs/absents_main.log", "w", "utf8")
    for n in reversed(sorted(pages_by_iwnumber.keys())):
        i = 0; p = 0
        f.write(u"== %i interviquis (%i) ==\r\n" % (n, len(pages_by_iwnumber[n])))
        for page in pages_by_iwnumber[n]:
            i += 1
            line = u"# [[:en:{page}]] {size}\r\n".format(
                page=pages_by_iwnumber[n][p][1],
                size=format_bytes(pages_by_iwnumber[n][p][0])
            )
            # Repeat the section label every 25 rows inside long sections.
            if len(pages_by_iwnumber[n]) > 50 and i % 25 == 0 and len(pages_by_iwnumber[n]) - i > 25:
                line += u"#;%i interviquis\r\n" % n
            f.write(line)
            p += 1
        f.write("\r\n")
    f.close()
    if args.publish in ("all", "main"):
        # BUG FIX: `txt` was never assigned in this function, so publishing
        # from here raised NameError.  Load the file just written, exactly
        # as publish() does for the same page.
        txt = read_file("absents_main")
        page = wikibot.Page(get_site("ca"), "%s/general" % root)
        save(page, txt)
def publish(log):
    u"""Push previously generated lists to the wiki.

    log -- one of "all", "main" or "random"; selects which lists to edit.
    Used when the lists were built without publishing them at the time.
    """
    if log in ("all", "main"):
        content = read_file("absents_main")
        target = wikibot.Page(get_site("ca"), "%s/general" % root)
        save(target, content)
    if log in ("all", "random"):
        for n in range(1, 11):
            content = read_file("absents_random-%03i" % n)
            target = wikibot.Page(get_site("ca"), "%s/aleatoris/%03i" % (root, n))
            save(target, content)
def backup():
    u"""Rename the random/published log files with a timestamp so the
    next run does not overwrite them."""
    log_dir = os.path.join(os.getcwd(), "logs")
    stamp = time.strftime("backup%y%m%d%H%M%S")
    for name in os.listdir(log_dir):
        src = os.path.join(log_dir, name)
        if name.startswith("absents_random-"):
            # Preserve the list number in the archived name.
            number = int(re.search("(\d+)", name).group(1))
            dst = os.path.join(log_dir, "absents_%s-random-%03i.log" % (stamp, number))
            os.rename(src, dst)
        elif name.startswith("absents_published"):
            dst = os.path.join(log_dir, "absents_%s-published.log" % stamp)
            os.rename(src, dst)
if __name__ == "__main__":
    u"""
    Primers passos:
    * crear registre d'articles absents amb:
    abstents.py -g
    * crear llista principal amb:
    abstents.py -m
    En un sol pas:
    (l'ordre dels arguments és important!)
    abstents.py -g -m
    Passos posteriors:
    Si ja tenim la llista principal podem passar a publicar i crear llistes aleatòries:
    abstents.py -r -p all
    En un sol pas:
    (l'ordre dels arguments és important!)
    abstents.py -g -m -r -p all
    """
    # Usage summary (translation of the note above; argument order matters):
    #   -g           build the raw absent-article logs from the replica DBs
    #   -m           build the main list (requires -g output)
    #   -r           build the 10 random sub-lists
    #   -p WHICH     publish "all", "main" or "random" lists
    #   -i N         minimum number of interwikis (default 40)
    #   -b           timestamp-rename logs to avoid overwriting
    root = "Viquiprojecte:Articles absents"  # target wiki page prefix
    c = Chrono()  # global chronometer, read by the worker functions
    c.start()
    p = argparse.ArgumentParser()
    p.add_argument("-g", dest="getabsents", action="store_true", default=False)
    p.add_argument("-m", dest="mainlist", action="store_true", default=False)
    p.add_argument("-r", dest="randomlists", action="store_true", default=False)
    p.add_argument("-p", dest="publish", action="store", choices=["all","main","random"], default=None)
    p.add_argument("-i", dest="iwmin", action="store", type=int, default=40)
    p.add_argument("-b", dest="backup", action="store_true", default=False)
    args = p.parse_args()
    get_site = wikibot.getSite
    if args.getabsents:
        # Fetch raw lists, then clean them (skipping already-published pages).
        get_absents()
        fix_disamb(skip=True)
    if args.mainlist:
        main_list()
    if args.randomlists:
        random_lists()
    # main_list()/random_lists() already publish inline when -p is given, so
    # only call publish() here if at least one of -m/-r did not run.
    # NOTE(review): with both -m and -r given this skips publish() entirely,
    # relying on the inline publishing above — confirm that is intended.
    if args.publish and (not args.mainlist or not args.randomlists):
        # choices= already restricts -p to these values; this re-check is
        # redundant but harmless.
        if args.publish in ("all", "main", "random"):
            publish(args.publish)
    if args.backup:
        backup()
    c.stop()
    c.show()