Usuari:TronaBot/Python/absents.py

#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-

# Copyleft (!C) 2014 Coet
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import argparse, codecs as cs, os, random, re, sys, time
sys.path.append("/home/pasqual/public_html/pyuserlib/")
from func import format_bytes, sort_list, Chrono
import wikipedia as wikibot, query as api

def read_file(filename):
	"""Retorna el contingut d'un fitxer,
	o una cadena buida si aquest no existeix."""

	filename = "logs/%s.log" % filename
	if not os.path.exists(filename): return ""
	f = cs.open(filename, "r", "utf8")
	txt = f.read()
	f.close()
	return txt

def get_page(title):
	return wikibot.Page(get_site("en"), title)

def API(params, site):
	return api.GetData(params, site = site)

def get_properties(article):
	"""
	Return the article's properties. We want to know whether it exists
	and how many languages it appears in. It also gives us the article's size.
	"""
	params = {
		"action": "query",
		"prop": ["langlinks", "revisions"],
		"lllimit": "max",
		"rvprop": "size",
		"rvlimit": 1,
		"titles": article.title(),
		"indexpageids": ""
	}
	q = API(params, article.site())
	langs = []
	pageid = q['query']['pageids'][0]
	if "missing" in q['query']['pages'][pageid]:
		return langs, 0 # missing page
	if "langlinks" not in q['query']['pages'][pageid]:
		return langs, 0
	for iw in q['query']['pages'][pageid]['langlinks']:
		langs.append(iw['lang'])
	return langs, q['query']['pages'][pageid]['revisions'][0]['size']
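
# Example of the return value (shape assumed from the MediaWiki API):
#   get_properties(page)  ->  (["de", "es", "fr", ...], 53214)
# and ([], 0) for missing pages or pages with no langlinks.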

def get_absents(limit=None, langs=None):
	"""
	Fetch the list of articles we do not have but that exist on at least
	args.iwmin other wikis.
		limit: caps the number of articles. Must be an integer.
		langs: languages to search in. Must be a list of language codes.
	"""
	limit = limit and " LIMIT %i" % limit or ""
	langs = langs or ["en", "es", "de", "it", "ja", "fr", "gl", "eu", "pt"]
	f = "/home/pasqual/public_html/pywikilab/logs/absents_{lg}wiki.log"
	for lang in langs:
		f = os.path.join("logs", "absents_%swiki.log" % lang)
		data = {"fi": f, "lg":lang, "li":limit, "iwmin": args.iwmin-1}
		sql = "SELECT COUNT(ll_lang) AS ll_count, CONCAT('[[:{lg}:', page_title, ']]') AS page_wlink, page_len " \
		"FROM langlinks LEFT JOIN page ON page_id = ll_from " \
		"WHERE page_namespace = 0 " \
		"GROUP BY ll_from " \
		"HAVING MAX(ll_lang = 'ca') = 0 AND COUNT(ll_lang) > {iwmin} " \
		"ORDER BY ll_count DESC, page_len DESC, page_title ASC{li};".format(**data)
		data['sql']=sql
		call = 'mysql -h {lg}wiki-p.db.toolserver.org -e "USE {lg}wiki_p; {sql}" > {fi}'.format(**data)
		c.show()
		print "calling {lg}wiki_p db".format(**data)
		os.system(call)
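
# Each line written by the mysql call in get_absents() is tab-separated:
#   ll_count<TAB>[[:lg:Page_title]]<TAB>page_len
# preceded by a column-header row; list_absents() splits on the tabs and the
# callers skip the header row with the isdigit() check.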

def list_absents():
	u"""Enllistem articles amb nombre de traduccions, títol i mida."""
	for line in read_file("absents_cleaned").splitlines():
		yield line.split("\t")

def get_pages():
	u"""Creem un diccionari amb les pàgines i el seu nombre de traduccions i mida."""
	pages={}
	for counter, page, size in list_absents():
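		# the first row of the mysql output is the column header; skip it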
		if not counter.isdigit(): continue
		counter = int(counter)
		#size = convert_bytes(size)
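		# pull "Page_title" out of the "[[:lg:Page_title]]" wikilink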
		page = re.search("\[\[:[^:]+:([^\]]+)\]\]", page).group(1)
		pages[page] = (counter, size)
	return pages

def get_published_articles():
	"""Obtenim la llista d'articles publicats."""
	if not read_file("absents_published"): return []
	return re.findall("\[\[:en:([^\]]+?)\]\]", read_file("absents_published"))

def fix_disamb(skip=False):
	u"""netegem els resultats traent les pàgines de desambiguació.
	El procés dura unes 4 hores per a enwiki. S'haurien d'eliminar en la consulta SQL.
	"""

	pages=[]
	skip = get_published_articles() if skip else []
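	# counters: s = skipped (already published), d = disambiguations,
	# f = false positives (the live API data no longer matches the SQL row)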
	s = 0; d = 0; f = 0
	for counter, page, size in list_absents():
		if not counter.isdigit(): continue
		counter = int(counter)
		page = re.search("\[\[:[^:]+:([^\]]+)\]\]", page).group(1).replace("_"," ")
		en_page=get_page(page)
		if not en_page.exists():
			continue
		if en_page.isDisambig():
			wikibot.output(u"skip disamb %s" % page)
			d+=1
			continue
		if page in skip:
			wikibot.output(u"skip %s" % page)
			s+=1
			continue
		api_iws, api_size = get_properties(en_page)
		if len(api_iws) < args.iwmin or "ca" in api_iws:
			wikibot.output(u"skip false %s" % page)
			f+=1
			continue
		if len(pages)%1000==0:
			wikibot.output(u"%i %s [d: %i, s: %i, f: %i]" % (len(pages), page, d, s, f))
			c.show()
		pages.append((counter, page, size))
	wf = cs.open("logs/absents_cleaned.log", "w", "utf8")
	i=0
	for items in pages:
		i+=1
		wf.write(u"%i\t[[:en:%s]]\t%s\r\n" % items)
	wf.close()
	wikibot.output(u"[t: %i, d: %i, s: %i, f: %i]" % (i, d, s, f))

def save(page, new_content):
	u"""Arxiva els contingut de les pàgines abans d'inserir el nou contingut."""
	archive = wikibot.Page(page.site(), u"%s/arxiu" % page.title())
	old_content = page.get()
	archive.put(old_content, "Bot: arxivant")
	page.put(new_content, "Bot: actualitzant", force=True)

def random_lists():
	u"""Creem 10 llistes amb 100 articles cadascuna agafant aleatòriament
	articles de la llista principal, sempre que no hagen estat ja publicats."""

	pages=get_pages()
	skip=get_published_articles()
	for page in pages.keys():
		# drop articles that have already been published
		if page in skip:
			pages.pop(page)
			continue
		# drop articles with fewer interwikis than we want
		if pages[page][0] < args.iwmin: pages.pop(page)

	i=0
	print "len pages.keys()", len(pages.keys())
	published = list(skip)
	while i<10:
		i+=1
		txt = ""
		#get 100 articles
		stack={}
		while len(stack)<100:
			if len(pages)==0:
				break
			page = random.choice(pages.keys())
			values = pages.pop(page)
			en_page=get_page(page)
			if en_page.isDisambig(): continue
			iws, size = get_properties(en_page)
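			# keep only pages that still have >= args.iwmin interwikis and no 'ca' version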
			if not (len(iws) >= args.iwmin and "ca" not in iws): continue
			stack[page]=values
		if len(stack)==0:
			break
		sorted_items = {}
		for page in stack:
			iws, size = stack[page]
			if iws not in sorted_items:
				sorted_items[iws] = []
			sorted_items[iws].append((int(size), page))
		for n in sorted_items:
			sorted_items[n].sort()
			sorted_items[n].reverse()

		f = cs.open("logs/absents_random-%03i.log" % i, "w", "utf8")
		line = u"""{{Avís|Signeu amb només <code><nowiki>~~~ </nowiki>""" \
		u"""</code>, no cal la quarta. }}\n{| class="wikitable sortable"\n""" \
		u"""! # iws!! Article en anglès !! Mida !! Usuari !! Observacions !! Article en català\n"""
		txt+=line
		f.write(line)
		for n in reversed(sorted(sorted_items.keys())):
			for item in sorted_items[n]:
				title = item[1]
				size = item[0]
				published.append(title)
				line = u"|-\n| %i || [[:en:%s]] || %s ||   ||   ||" % (
					n,
					title,
					format_bytes(size)
				)
				#wikibot.output(line)
				txt+= u"%s\r\n" % line
				f.write(u"%s\r\n" % line)
		line = "|}"
		f.write(line)
		f.close()
		txt += line
		print "r%03i" % i,
		c.show()
		if args.publish in ("all", "random"):
			page = wikibot.Page(get_site("ca"), "%s/aleatoris/%03i" % (root, i))
			save(page, txt)
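	# persist every title published so far, one "[[:en:Title]]" wikilink per
	# line; get_published_articles() reads them back with a matching regex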
	f = cs.open("logs/absents_published.log", "w", "utf8")
	f.write(u"[[:en:%s]]" % (u"]]\r\n[[:en:".join(sort_list(published))))
	f.close()

def main_list():
	u"""És la llista principal amb tots els articles preparada per a ser publicada."""
	pages = get_pages()
	pages_by_iwnumber = {}
	for page in pages:
		counter, size = pages[page]
		if counter not in pages_by_iwnumber:
			pages_by_iwnumber[counter] = []
		pages_by_iwnumber[counter].append((int(size), page))
	for counter in pages_by_iwnumber.keys():
		pages_by_iwnumber[counter].sort()
		pages_by_iwnumber[counter].reverse()
	f = cs.open("logs/absents_main.log", "w", "utf8")
	for n in reversed(sorted(pages_by_iwnumber.keys())):
		i = 0
		f.write(u"== %i interviquis (%i) ==\r\n" % (n, len(pages_by_iwnumber[n])))
		for size, page in pages_by_iwnumber[n]:
			i += 1
			line = u"# [[:en:{page}]] {size}\r\n".format(
				page = page,
				size = format_bytes(size)
			)
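			# every 25 entries in long sections (except near the end), repeat
			# a "#;N interviquis" sub-heading to break up the list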
			if len(pages_by_iwnumber[n]) > 50 and i % 25 == 0 and len(pages_by_iwnumber[n]) - i > 25:
				line += u"#;%i interviquis\r\n" % n
			f.write(line)
		f.write("\r\n")
	f.close()
	if args.publish in ("all", "main"):
		page = wikibot.Page(get_site("ca"), "%s/general" % root)
		save(page, read_file("absents_main"))

def publish(log):
	u"""Edita les llistes. S'utilitza si no s'ha fet servir abans"""
	if log in ("all", "main"):
		txt = read_file("absents_main")
		page = wikibot.Page(get_site("ca"), "%s/general" % root)
		save(page, txt)
	if log in ("all", "random"):
		for n in range(1,11):
			txt = read_file("absents_random-%03i" % n)
			page = wikibot.Page(get_site("ca"), "%s/aleatoris/%03i" % (root, n))
			save(page, txt)

def backup():
	u"""Reanomenem els fitxers per a evitar la sobreescritura"""
	path = os.path.join(os.getcwd(),"logs")
	s=time.strftime("backup%y%m%d%H%M%S")
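	# e.g. "backup140325143022" for 2014-03-25 14:30:22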
	for filename in os.listdir(path):
		if filename.startswith("absents_random-"):
			number = int(re.search("(\d+)", filename).group(1))
			new_name = "absents_%s-random-%03i.log" % (s, number)
			os.rename(os.path.join(path, filename), os.path.join(path, new_name))
		elif filename.startswith("absents_published"):
			os.rename(os.path.join(path, filename),os.path.join(path, "absents_%s-published.log" % s))

if __name__ == "__main__":
	u"""
	Primers passos:
	* crear registre d'articles absents amb:
		abstents.py -g
	* crear llista principal amb:
		abstents.py -m
	En un sol pas:
		(l'ordre dels arguments és important!)
		abstents.py -g -m

	Passos posteriors:
		Si ja tenim la llista principal podem passar a publicar i crear llistes aleatòries:
		abstents.py -r -p all
	En un sol pas:
		(l'ordre dels arguments és important!)
		abstents.py -g -m -r -p all
	"""
	root = "Viquiprojecte:Articles absents"
	c = Chrono()
	c.start()
	p = argparse.ArgumentParser()
	p.add_argument("-g", dest="getabsents", action="store_true", default=False)
	p.add_argument("-m", dest="mainlist", action="store_true", default=False)
	p.add_argument("-r", dest="randomlists", action="store_true", default=False)
	p.add_argument("-p", dest="publish", action="store", choices=["all","main","random"], default=None)
	p.add_argument("-i", dest="iwmin", action="store", type=int, default=40)
	p.add_argument("-b", dest="backup", action="store_true", default=False)
	args = p.parse_args()
	get_site = wikibot.getSite
	if args.getabsents:
		get_absents()
		fix_disamb(skip=True)
	if args.mainlist:
		main_list()
	if args.randomlists:
		random_lists()
	if args.publish and (not args.mainlist or not args.randomlists):
		publish(args.publish)
	if args.backup:
		backup()

	c.stop()
	c.show()