#!/usr/bin/python3
# ==============================================================================================
# Page converter design
# ==============================================================================================
# nam   Name for that stream
# snd   Sound to be played
# url   URL of the stream
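# div   xpath to get every division / record
# key   xpath to the unique identifier of a record, relative to its division
# dat   where is the date  in this division
# tit   where is the title in this division
# inf   where is the info  in this division
# xml   True if the page is XML (HTML cleaning is then skipped)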
# ----------------------------------------------------------------------------------------------
# Site 1 -- Reuters front page.
nam_1 = "Reuters"                      # display name of the stream
snd_1 = "ReutersFeed.wav"              # sound file associated with the stream
url_1 = "http://www.reuters.com/"      # page to fetch
# One record per <article> of the headline list.
div_1 = (
    "//div[@class='news-headline-list medium']"
    "/article[starts-with(@class,'story')]"
)
# The xpaths below are evaluated relative to each record.
key_1 = "./div[@class='story-content']/a/@href"   # unique identifier (link target)
dat_1 = ".//span[@class='timestamp']"             # date
tit_1 = ".//h3[@class='story-title']"             # title
inf_1 = ".//p"                                    # info text
xml_1 = False                                     # HTML page, not XML
# ----------------------------------------------------------------------------------------------
# Site 2 -- AFP news hub.
nam_2 = "AFP"
snd_2 = "AFPFeed.wav"
url_2 = "https://www.afp.com/en/news-hub"
# One record per title item inside an "afp_news_*" container.
div_2 = (
    "//div[starts-with(@id,'afp_news_')]"
    "/div[starts-with(@class,'item_title')]"
)
key_2 = ".//h4[starts-with(@class,'htitle')]/a/@href"   # unique identifier (link target)
dat_2 = "./span"                                        # date
tit_2 = "./h4[starts-with(@class,'htitle')]/a"          # title
inf_2 = "./div/p"                                       # info text
xml_2 = False
# ----------------------------------------------------------------------------------------------
# Site 3 -- mobile Twitter timeline.
nam_3 = "@realDonaldTrump"
snd_3 = "TwitterFeed.wav"
url_3 = "https://mobile.twitter.com/realDonaldTrump"
div_3 = "//table[starts-with(@class,'tweet')]"          # one record per tweet table
# Unique identifier: the name attribute of the timestamp anchor.
key_3 = (
    "./tr[starts-with(@class,'tweet-header')]"
    "/td[@class='timestamp']/a/@name"
)
# Date: the timestamp anchor itself.
dat_3 = (
    "./tr[starts-with(@class,'tweet-header')]"
    "/td[@class='timestamp']/a"
)
# Title: the tweet text.
tit_3 = (
    "./tr[@class='tweet-container']"
    "/td[@class='tweet-content']"
    "/div[@class='tweet-text']/div"
)
# Info: the full name shown in the tweet header.
inf_3 = (
    "./tr[starts-with(@class,'tweet-header')]"
    "/td[@class='user-info']/a/strong[@class='fullname']"
)
xml_3 = False
# ==============================================================================================
class WebPage:
    """ RTTY_WebPage.WebPage Class: Process a WEB page and returns a ready to print string"""
    # Per-site extraction tables: the same index selects one site in every list.
    nam = [nam_1, nam_2, nam_3]
    snd = [snd_1, snd_2, snd_3]
    url = [url_1, url_2, url_3]
    div = [div_1, div_2, div_3]
    dat = [dat_1, dat_2, dat_3]
    tit = [tit_1, tit_2, tit_3]
    inf = [inf_1, inf_2, inf_3]
    key = [key_1, key_2, key_3]
    xml = [xml_1, xml_2, xml_3]
    # ------------------------------------------------------------------------------------------
    def __init__(self, linemax=64, site=0, dbg=False, trk=False):
        """Set up a reader.

        linemax -- number of characters after which Read() inserts a CR/LF
        site    -- index of the site to read (position in the class tables)
        dbg     -- when true, progress and error messages are printed
        trk     -- when true, every matched element is dumped for tracing
        """
        import RTTY_Hash
        self._dbg     = dbg
        self._running = False               # True while Read() is executing
        self._abort   = False               # set by Abort(), polled by Read()
        self._trk     = trk
        self._site    = site
        self._max     = len(self.nam)-1     # highest valid site index
        self._hsh     = RTTY_Hash.Hash()    # remembers the record keys already seen
        self._linemax = linemax
    # ------------------------------------------------------------------------------------------
    def __repr__(self):
        """Return the line length, the valid site range and the configured URLs."""
        msg = "\n ".join(self.url[:len(self.nam)])
        return "len:{} sitemax:0..{}\n ".format(self._linemax, self._max)+msg
    # ------------------------------------------------------------------------------------------
    @property
    def dbg(self):
        """Debug flag: print progress and error messages when true."""
        return self._dbg
    @dbg.setter
    def dbg(self, x):
        self._dbg = x
    # ------------------------------------------------------------------------------------------
    @property
    def trk(self):
        """Trace flag: dump every matched element when true."""
        return self._trk
    @trk.setter
    def trk(self, x):
        self._trk = x
    # ------------------------------------------------------------------------------------------
    @property
    def sitemax(self):
        """Highest valid site index (read-only)."""
        # The value is stored as _max by __init__; there is no _sitemax attribute.
        return self._max
    # ------------------------------------------------------------------------------------------
    @property
    def linemax(self):
        """Number of characters per output line."""
        return self._linemax
    # Each setter must hang off its own property: decorating with @dbg.setter
    # would build a property that still uses the dbg getter.
    @linemax.setter
    def linemax(self, x):
        self._linemax = x
    # ------------------------------------------------------------------------------------------
    @property
    def site(self):
        """Index of the site that Read() processes."""
        return self._site
    @site.setter
    def site(self, x):
        self._site = x
    # ------------------------------------------------------------------------------------------
    def cleanline(self,line):
        """Return line without non-ASCII characters, line feeds and surrounding blanks."""
        if (line != ""):
            line = line.encode('ascii', 'ignore').decode()
            line = line.replace("\n", "")
            line = line.strip()
        return line
    # ------------------------------------------------------------------------------------------
    def _wrap(self, text):
        """Clean text, then insert a CR/LF after every linemax characters."""
        import re
        wrapped = re.sub("(.{"+str(self._linemax)+"})", "\\1\r\n",
                         self.cleanline(text), count=0, flags=re.DOTALL)
        # A wrapped line must not start with white space.
        return re.sub(r"\r\n\s+", "\r\n", wrapped)
    # ------------------------------------------------------------------------------------------
    def Read(self):
        """Fetch the current site and return its new records as printable text.

        Every record found with the site's 'div' xpath is identified through
        its 'key' xpath; records whose key the hash object reports as already
        known are skipped. The others are appended as
        "----", "<name> - <date>", title and info, separated by CR/LF.

        Returns "Error loading page" when the page cannot be fetched or
        parsed, "" when Abort() was requested, otherwise the assembled text.
        That text may be partial or empty when processing fails midway:
        processing errors are caught and only reported in debug mode.

        Raises ValueError when the site index is past the end of the tables.
        """
        import urllib.request
        from lxml import html
        from lxml import etree
        from lxml.html.clean import clean_html
        # --------------------------------------------------------------------------------------
        site = self._site
        if (site >= len(self.nam)):
            msg = "Webpage reader error on index "+str(site)+" (max="+str(len(self.nam)-1)+")"
            raise ValueError(msg)
        # --------------------------------------------------------------------------------------
        text = ""
        self._running = True
        try:
            # ----------------------------------------------------------------------------------
            # Load and parse the page; the response is closed as soon as it is read
            # ----------------------------------------------------------------------------------
            try:
                with urllib.request.urlopen(self.url[site]) as response:
                    page = html.fromstring(response.read())
            except Exception as e:
                if (self._dbg): print("ERROR: PAGE LOAD ({0})".format(e))
                return "Error loading page"
            if (self._abort):
                return ""
            # ----------------------------------------------------------------------------------
            if (self._dbg): print("INFO:  PAGE START")
            try:
                if (not self.xml[site]):
                    page = clean_html(page)
                for element in page.xpath(self.div[site]):
                    if (self._abort):
                        return ""
                    if (self._trk):
                        print("DIV: " + element.text_content())
                        print(etree.tostring(element, pretty_print = True))
                    #---------------------------------------------------------------------------
                    # Extract unique identifier and test for previous processing
                    #---------------------------------------------------------------------------
                    ckey = ""
                    for key in element.xpath(self.key[site]):
                        try:                        # Key can be an element: use its text
                            ckey = ckey+key.text_content()
                        except AttributeError:      # Key can be an attribute: already a string
                            ckey = ckey+key
                    if (ckey == ""):
                        msg="Webpage reader error on key search ("+self.key[site]+")"
                        raise ValueError(msg)
                    # NOTE(review): add2Dict is assumed to return a true value only the
                    # first time a key is seen -- confirm in RTTY_Hash.
                    if (not self._hsh.add2Dict(ckey)): continue
                    #---------------------------------------------------------------------------
                    # Get title: all matches are concatenated, then wrapped once
                    #---------------------------------------------------------------------------
                    ctit = ""
                    for title in element.xpath(self.tit[site]):
                        if (self._abort):
                            return ""
                        ctit = ctit+title.text_content()+" "
                        if (title.tail is not None): ctit=ctit+title.tail
                        if (self._trk):
                            print("TIT")
                            print(etree.tostring(title, pretty_print = True))
                    # Wrapping inside the loop would re-wrap text that already holds
                    # CR/LF pairs and leave stray CRs in the middle of the lines.
                    ctit = self._wrap(ctit)
                    #---------------------------------------------------------------------------
                    # Get Date (the last match wins)
                    #---------------------------------------------------------------------------
                    cdat = ""
                    for date in element.xpath(self.dat[site]):
                        if (self._abort):
                            return ""
                        cdat = date.text_content()
                        if (date.tail is not None): cdat=cdat+date.tail
                        if (cdat == ""):
                            msg = "Webpage reader error on date search ("+self.dat[site]+")"
                            raise ValueError(msg)
                        cdat = self.cleanline(cdat)
                        if (self._trk):
                            print("DAT: ")
                            print(etree.tostring(date, pretty_print = True))
                    #---------------------------------------------------------------------------
                    # Get information (the last match wins)
                    #---------------------------------------------------------------------------
                    cinf = ""
                    for info in element.xpath(self.inf[site]):
                        if (self._abort):
                            return ""
                        cinf = info.text_content()
                        if (info.tail is not None): cinf = cinf+info.tail
                        if (cinf == ""):
                            msg = "Webpage reader error on info search ("+self.inf[site]+")"
                            raise ValueError(msg)
                        cinf = self._wrap(cinf)
                        if (self._trk):
                            print("INF: ")
                            print(etree.tostring(info, pretty_print = True))
                    #---------------------------------------------------------------------------
                    # First seen: assemble items
                    #---------------------------------------------------------------------------
                    text += '----\r\n'
                    text += self.nam[site]+" - "+cdat
                    text += '\r\n'
                    text += ctit
                    text += '\r\n'
                    text += cinf
                    text += '\r\n'
                if (self._dbg):
                    print(text)
            except Exception as e:
                # Best effort: keep whatever was assembled before the failure.
                if (self._dbg): print("ERROR: PAGE PROCESSING ({0})".format(e))
            # ----------------------------------------------------------------------------------
            # NOTE(review): cleanDict presumably purges stale keys -- confirm in RTTY_Hash.
            self._hsh.cleanDict()
            # ----------------------------------------------------------------------------------
            if (self._dbg): print("INFO:  PAGE END")
            return text
        finally:
            # Single exit point for the running flag, whatever path is taken.
            self._running = False
    # ------------------------------------------------------------------------------------------
    def Abort(self):
        """Ask a Read() in progress to stop; it then returns an empty string.

        The request is ignored when no read is running. The abort flag is not
        cleared anywhere in this class, so later reads keep returning "".
        """
        if (self._running):
            self._abort = True
    # ------------------------------------------------------------------------------------------
    def Test(self):
        """Read every configured site once and print the result."""
        print("TEST:  WEB PAGE EXTRACT")
        try:
            for i in range(len(self.nam)):
                self.site = i
                print(self.Read())
        except Exception:
            # SystemExit and KeyboardInterrupt are deliberately not caught here.
            print("-> Not passed")
# ==============================================================================================
if __name__ == '__main__':
    # Self-test entry point: print the class docstring, then read every
    # configured site once with debug output enabled.
    import sys
    import signal
    from inspect import getdoc
    print(getdoc(WebPage))
    this = WebPage()
    this.dbg = True
    # this.trk = True   # uncomment to also dump every matched element
    def signal_term_handler(signum, frame):
        """Stop a read in progress, then exit with status 0."""
        this.Abort()
        sys.exit(0)
    # The handlers are installed only once `this` exists, so that an early
    # signal cannot reach an undefined name inside the handler.
    signal.signal(signal.SIGINT,  signal_term_handler)
    signal.signal(signal.SIGTERM, signal_term_handler)
    this.Test()
# ==============================================================================================
