# Source: Catalan Wikipedia page "Viquipèdia:Viquitrobada 2009/Taller de bots/traductor gencat" (from Wikipedia, the free encyclopedia).
# -*- coding: utf-8 -*-
#!/usr/bin/env python
#[[w:ca:Viquipèdia:Viquitrobada 2009/Taller de bots/traductor gencat]]

import codecs, httplib, json, re, time, urllib, urllib2
from time import strftime as date
import wikipedia as pywikibot

# HTTP header values used for requests to the translator service.
user_agent = u'Mozilla/5.0 (cawiki autotranslator 1.0)'
# Both cache-related headers carry the same directive.
cache_control = pragma = u'no-cache'
# Charset advertised in requests, and the module-wide encoding name.
accept_charset = encoding = 'utf-8'

# Translation directions handled by get().
# Key: the "<source>-<target>" pair given on the command line.
# "dir": direction code placed in the translator URL.
# "code": Wikipedia language subdomain the page is fetched from.
oc_dirs = {
    "ca-oc": {"dir": "ca-oc", "code": "ca"},
    "ca-aran": {"dir": "ca-oc_aran", "code": "ca"},
    "es-oc": {"dir": "es-oc", "code": "es"},
    "es-aran": {"dir": "es-oc_aran", "code": "es"},
    "oc-ca": {"dir": "oc-ca", "code": "oc"},
    # This entry used to be keyed "aran-es" as well, so the later "aran-es"
    # entry silently replaced it and Aranese -> Catalan was unreachable.
    "aran-ca": {"dir": "oc_aran-ca", "code": "oc"},
    "oc-es": {"dir": "oc-es", "code": "oc"},
    # NOTE(review): the other Aranese codes spell the language "oc_aran";
    # "oc-es_aran" breaks that pattern -- confirm against the service.
    "aran-es": {"dir": "oc-es_aran", "code": "oc"},
}
# Translation directions handled by post(): every pair targets Catalan.
# "dir" is the translationDirection form value, "code" the Wikipedia
# language subdomain of the source page.
not_oc_dirs = dict(
    (lang_code + "-ca", {"dir": lang_name + "-CATALAN", "code": lang_code})
    for lang_code, lang_name in (
        ("en", "ENGLISH"),
        ("fr", "FRENCH"),
        ("de", "GERMAN"),
        ("es", "SPANISH"),
    )
)

def get(title, dir, path, action=""):
    """Fetch a Wikipedia page through the translator's GET interface.

    title  -- page title (byte string or unicode).
    dir    -- key of oc_dirs selecting the translation direction.
    path   -- URL path prefix placed between the host and the title.
    action -- optional query suffix appended after the title.

    Returns the raw response body, or "" when the user interrupts the
    request or after five failed attempts. Raises KeyError when dir is not
    a key of oc_dirs.
    """
    dirs = oc_dirs[dir]
    # urllib.quote() raises KeyError on unicode text holding non-ASCII
    # characters, so percent-encode the UTF-8 bytes instead.
    if isinstance(title, unicode):
        title = title.encode("utf-8")
    url_title = urllib.quote(title)
    target_url = "http://%(code)s.wikipedia.org/%(path)s%(title)s%(axn)s" % {
        "code": dirs["code"], "title": url_title, "axn": action, "path": path}
    # The target URL travels inside the translator's own query string, so it
    # must be escaped; otherwise any "&" in the action is read as a
    # parameter of the translator rather than of the target page.
    url = "http://traductor.gencat.cat/jsp/content.jsp?dir=%(dir)s&marca=false&url=%(url)s" % {
        "url": urllib.quote(target_url, safe=""), "dir": dirs["dir"]}
    print(url)
    data = None
    retry = 0
    while data is None:
        try:
            request = urllib2.Request(url)
            request.add_header('User-Agent', user_agent)
            request.add_header('Cache-Control', cache_control)
            request.add_header('Pragma', pragma)
            request.add_header('Accept-Charset', accept_charset)
            response = urllib2.urlopen(request)
            try:
                data = response.read()
            finally:
                # Release the socket even when reading fails.
                response.close()
        except KeyboardInterrupt:
            print(u"l'usuari ha cancel·lat")
            data = ""
        except Exception as e:
            print("%s %s" % (e, retry))
            retry += 1
            if retry >= 5:
                # Give up: an empty result ends the loop.
                data = ""
    return data

def post(title, dir):
    """Translate a Wikipedia edit page through the translator's POST form.

    title -- page title (byte string or unicode).
    dir   -- key of not_oc_dirs selecting the translation direction.

    Returns the raw response body, or "" when the user interrupts the
    request or after five failed attempts. Raises KeyError when dir is not
    a key of not_oc_dirs.
    """
    dirs = not_oc_dirs[dir]
    # urllib.quote() raises KeyError on unicode text holding non-ASCII
    # characters, so percent-encode the UTF-8 bytes instead.
    if isinstance(title, unicode):
        title = title.encode("utf-8")
    url = u"http://%s.wikipedia.org/wiki/%s?action=edit" % (dirs['code'], urllib.quote(title))
    params = urllib.urlencode({
        "translationDirection": dirs['dir'],
        "subjectArea": "GV",
        "MARK_ALTERNATIVES": "0",
        "url": url
    })
    headers = {
        'User-Agent': user_agent,
        "Content-type": "application/x-www-form-urlencoded",
        "Accept": "text/plain",
    }
    data = None
    retry = 0
    while data is None:
        try:
            # NOTE(review): get() talks to traductor.gencat.cat while this
            # uses the .net host -- confirm both are still valid.
            conn = httplib.HTTPConnection('traductor.gencat.net')
            try:
                conn.request("POST", "/url.do", params, headers)
                data = conn.getresponse().read()
            finally:
                # Close the connection on success and on failure alike.
                conn.close()
        except KeyboardInterrupt:
            print(u"l'usuari ha cancel·lat")
            data = ""
        except Exception as e:
            print("%s %s" % (e, retry))
            retry += 1
            if retry >= 5:
                # Give up: an empty result ends the loop.
                data = ""
    return data

def main():
    """Translate one Wikipedia page and save the result to a text file.

    Command-line options (each written as -key:value):
      -t / -title          page title
      -l / -langs / -languages
                           language pair, a key of oc_dirs or not_oc_dirs
      -a                   "e" for the edit view, "j" for the raw view
      -p                   "w" to address the page through index.php
    The title and language-pair options also accept Catalan long names.
    Missing values are asked for interactively; an empty answer falls back
    to the page "Esperanto" and the pair "en-ca".
    """
    args = pywikibot.handleArgs()
    dir = None
    title = None
    path = "wiki/"
    action = ""
    # Parse the parameters from the command line.
    for arg in args:
        # startswith() also copes with an empty argument.
        if not arg.startswith("-"):
            continue
        key, _, value = arg[1:].partition(":")
        if key in ("t", u"títol", "title"):
            title = value
        elif key in ("l", u"llengües", "langs", "languages"):
            dir = value
        elif key == "a":
            if value == "j":
                action = "&action=raw&ctype=text/javascript"
            elif value == "e":
                action = "&action=edit"
            else:
                action = ""
        elif key == "p":
            # With index.php the title is itself a query parameter, which
            # is why the action strings start with "&".
            if value == "w":
                path = "w/index.php?title="
    if path == "wiki/":
        # Pretty URLs carry no query string yet: the action opens it.
        action = action.replace("&", "?", 1)

    # Fix wrong or empty parameter values.
    if not title:
        title = pywikibot.input(u"pàgina:") or u"Esperanto"
    # Keep asking until the pair is known, so get()/post() never receive a
    # key missing from their tables; an empty answer selects "en-ca".
    while dir not in not_oc_dirs and dir not in oc_dirs:
        dir = pywikibot.input(u"llengües:") or "en-ca"

    # Run the translation. txt is still raw bytes at this point, so plain
    # byte-string patterns are applied to it.
    if dir in not_oc_dirs:
        txt = post(title, dir)
        # Drop the <textarea> tags but keep their content.
        txt = re.sub(r"<textarea[^>]*?>([^<]*?)</textarea>", r"\1", txt)
    else:
        txt = get(title, dir, path, action)
        # Unwrap the red <span> elements found in the translator output.
        txt = re.sub(r"(<span style='color:red'>)([^<]*?)(</span>)", r"\2", txt)
    if not txt:
        return
    txt = pywikibot.html2unicode(txt.decode("utf-8"))
    # A "/" in the title (wiki subpages) would be taken as a directory
    # separator and make the file impossible to create.
    safe_title = title.replace("/", "_")
    filename = u"psq-Gomà-trad-%s-(%s)-%s.txt" % (date("%y%m%d%H%M"), dir, safe_title)
    with codecs.open(filename, "w", 'utf-8') as f:
        f.write(txt)

if __name__ == "__main__":
    # Sample command lines (Catalan to Occitan). Every option needs its
    # leading "-": main() skips arguments that do not start with it.
    # >traductor.py -l:ca-oc -t:Esperanto
    # >traductor.py -l:ca-oc -t:Esperanto -a:e
    # >traductor.py -l:ca-oc -t:Esperanto -a:j
    # >traductor.py -l:ca-oc -t:Esperanto -p:w
    # >traductor.py -l:ca-oc -t:Esperanto -p:w -a:e
    # >traductor.py -l:ca-oc -t:Esperanto -p:w -a:j
    main()