parse.py 9.02 KB
#!/usr/bin/env python3

import sys
import argparse
import datetime
import urllib.request
import dateutil.parser
from icalendar import Calendar
from html.parser import HTMLParser
from icalendar import vDatetime, Calendar, Event as CalEvent

# Command line interface: the year, the timetable page name and an optional
# output path (defaults to '-').
parser = argparse.ArgumentParser(
    description='Convertit l\'emploi du temps IMA en ICS',
)
parser.add_argument(
    'annee',
    metavar='ANNEE',
    type=int,
    help='année (3 ou 4)',
)
parser.add_argument(
    'edt',
    metavar='EDT',
    type=str,
    help='la page pointant vers l\'emploi du temps concerné',
)
parser.add_argument(
    '-o', '--output',
    dest='file',
    type=str,
    default='-',
    help='fichier de sortie, - pour stdout',
)
args = parser.parse_args()

# Per-year configuration:
#   url         -- address of the timetable page to download
#   SLOTS       -- one ((start_hour, start_minute), (end_hour, end_minute))
#                  pair per time slot of a day, in chronological order
#   DATE_FORMAT -- strptime format of the dates found in the page
if args.annee == 3:
    url = 'http://dptima3.polytech-lille.net/' + args.edt + '.html'
    SLOTS = [(( 8,  0), (10,  0)),
             ((10, 20), (12, 20)),
             ((13, 50), (15, 50)),
             ((16, 10), (18, 10))]
    DATE_FORMAT = '%d/%m/%Y'
elif args.annee == 4:
    url = 'http://www.lifl.fr/~forget/EDT/' + args.edt + '.html'
    SLOTS = [(( 8,  0), ( 9,  0)),
             (( 9, 10), (10, 10)),
             ((10, 20), (11, 20)),
             ((11, 30), (12, 30)),
             ((13, 50), (14, 50)),
             ((15,  0), (16,  0)),
             ((16, 10), (17, 10)),
             ((17, 20), (18, 20))]
    DATE_FORMAT = '%d/%m/%y'
else:
    # The year is parsed as an int, so it has to be converted before being
    # concatenated to the message.
    raise ValueError('Année inconnue : ' + str(args.annee))

# Number of days held by one row of the first table; it is multiplied by the
# number of daily slots to know how many slot columns a row contains.
DAYS_PER_WEEK = 5.5

# First table: column index of the date, then of the first slot
TABLE_1_DATE_X, TABLE_1_FIRST_SLOT_X = 1, 2

# Second table: column index of the date, then of the first slot
TABLE_2_DATE_X, TABLE_2_FIRST_SLOT_X = 0, 1

class TableHTMLParser(HTMLParser):
    """
    Collect the text content of the tables of an HTML document.

    After feeding, `tables` holds one entry per table closed by a </table>
    tag. A table is a list of lines (one per <tr>) and a line is a list of
    (text, colspan, rowspan) tuples (one per <td>), the text being stripped
    of surrounding whitespace.
    """

    def __init__(self, *args, **kwargs):
        """
        Create a parser with its own empty state; arguments are passed
        through to HTMLParser
        """
        super().__init__(*args, **kwargs)
        # The state is per instance: a list defined on the class would be
        # shared (and accumulated into) by every parser created.
        self.tables = []  # Tables
        self.table = None  # Current table content (list while in a table)
        self.line = None  # Current line content (list while in a line)
        self.cell = None  # Current cell content (str while in a cell)
        self.cellx = 1  # colspan of the current cell
        self.celly = 1  # rowspan of the current cell

    # Logic
    def iscell(self):
        """
        Return if we are currently in a cell
        """
        return isinstance(self.cell, str)

    def isline(self):
        """
        Return if we are currently in a line
        """
        return isinstance(self.line, list)

    def istable(self):
        """
        Return if we are currently in a table
        """
        return isinstance(self.table, list)

    # Actions
    def endcell(self):
        """
        Close the current cell, if any, and store it in the current line
        """
        if self.iscell():
            self.line.append((self.cell.strip(), self.cellx, self.celly))
            self.cell = None

    def endline(self):
        """
        Close the current line, if any (and its pending cell), and store it
        in the current table
        """
        self.endcell()
        if self.isline():
            self.table.append(self.line)
            self.line = None

    def endtable(self):
        """
        Close the current table, if any (and its pending line), and store it
        in `tables`
        """
        self.endline()
        if self.istable():
            self.tables.append(self.table)
            self.table = None

    # Inheritance
    def handle_starttag(self, tag, attrs):
        """
        Open a table, a line or a cell; an opening <tr> or <td> first closes
        the previous line or cell in case its end tag was omitted
        """
        if tag == 'table':
            self.table = []
        elif tag == 'tr':
            self.endline()
            self.line = []
        elif tag == 'td':
            self.endcell()
            self.cell = ''
            self.cellx = 1
            self.celly = 1
            for name, value in attrs:
                if name == 'colspan':
                    self.cellx = int(value)
                elif name == 'rowspan':
                    self.celly = int(value)

    def handle_endtag(self, tag):
        """
        Close the table, line or cell matching the end tag
        """
        if tag == 'table':
            self.endtable()
        elif tag == 'tr':
            self.endline()
        elif tag == 'td':
            self.endcell()

    def handle_data(self, data):
        """
        Append text to the current cell; text outside of a cell is ignored
        """
        if self.iscell():
            self.cell += data

# TODO Do something that really is OOP or do not...

class Event:
    """
    One timetable entry.

    The raw values are provided through the feed* methods, then endFeed()
    derives the values used for display and export from them.
    """
    # Mined data
    shortText = ''
    longText = ''
    date = False
    begSlot = 0
    endSlot = 0

    # Generated data
    shortName = ''
    longName = ''
    location = ''
    startTime = False
    endTime = False
    active = False

    def feedShortText(self, shortText):
        """
        Store the raw short text
        """
        self.shortText = shortText

    def feedLongText(self, longText):
        """
        Store the raw long text
        """
        self.longText = longText

    def feedBegSlot(self, slot):
        """
        Store the index of the first slot of the event
        """
        self.begSlot = slot

    def feedEndSlot(self, slot):
        """
        Store the index of the last slot of the event
        """
        self.endSlot = slot

    def feedDate(self, date):
        """
        Store the day of the event
        """
        self.date = date

    def endFeed(self):
        """
        Compute the generated data from the mined data
        """
        self.shortName, self.longName = self.shortText, self.longText

        # An event without a short name stays inactive
        if self.shortName:
            self.active = True

        # Start of the first slot and end of the last slot, added to the day
        slotsAreInts = isinstance(self.begSlot, int) and isinstance(self.endSlot, int)
        if self.date and slotsAreInts:
            begHour, begMinute = SLOTS[self.begSlot][0]
            self.startTime = self.date + datetime.timedelta(hours=begHour, minutes=begMinute)
            endHour, endMinute = SLOTS[self.endSlot][1]
            self.endTime = self.date + datetime.timedelta(hours=endHour, minutes=endMinute)

        if not self.longName:
            return

        # "name (location)": what precedes the first parenthesis is the name,
        # what it encloses is the location
        name, *others = self.longName.split('(')
        if others:
            self.longName = name.strip()
            self.location = others[0].split(')')[0].strip()

    def __str__(self):
        """
        Return a one-line description of the event
        """
        if not self.active:
            return 'Inactive event'

        pieces = [self.shortName, ' [', self.longName, '] ']
        if self.startTime:
            pieces += [str(self.startTime), ' - ', str(self.endTime), ' ']
        if self.location:
            pieces += ['@ ', self.location]
        return ''.join(pieces)

    def getEvent(self):
        """
        Return the icalendar event built from the generated data
        """
        uid = '-'.join((
            'polytech',
            'ima' + str(args.annee),
            args.edt,
            vDatetime(self.startTime).to_ical().decode(),
            vDatetime(self.endTime).to_ical().decode(),
            self.shortName,
        ))
        properties = (
            ('uid', uid),
            ('summary', self.shortName),
            ('description', self.longName),
            ('dtstart', self.startTime),
            ('dtend', self.endTime),
            ('location', self.location),
            ('last-modified', updateTime),
            ('dtstamp', updateTime),
        )
        calEvent = CalEvent()
        for name, value in properties:
            calEvent.add(name, value)
        return calEvent

# Download the timetable page
with urllib.request.urlopen(url) as handle:
    lastModified = handle.headers['Last-Modified']
    htmlStr = handle.read().decode('iso-8859-15')

# updateTime stamps every generated event (last-modified and dtstamp)
if lastModified:
    updateTime = dateutil.parser.parse(lastModified)
else:
    # The header lookup yields None when the server does not send
    # Last-Modified; use the download time instead of failing to parse None.
    updateTime = datetime.datetime.now(datetime.timezone.utc)

# Read HTML tables
parser = TableHTMLParser()
parser.feed(htmlStr)

# Expand colspan & rowspan: every parsed table becomes a rectangular grid in
# which the text of a spanning cell is repeated at each position it covers
tables = []

for parserTable in parser.tables:
    # Dimensions: the width is the sum of the colspans of the first line, the
    # height is the sum of the rowspans of the first cell of each line
    X = sum(cell[1] for cell in parserTable[0])
    Y = sum(line[0][2] for line in parserTable)

    # Grid with the real dimensions, False marking a position not filled yet
    table = [[False] * X for _ in range(Y)]

    # Filling the grid, each line starting again from the left
    for y, line in enumerate(parserTable):
        x = 0
        for text, colspan, rowspan in line:
            # Skipping the positions already filled by a previous cell
            while isinstance(table[y][x], str):
                x += 1

            # Repeating the text over the whole span of the cell
            for y2 in range(y, y + rowspan):
                for x2 in range(x, x + colspan):
                    table[y2][x2] = text

    tables.append(table)

# Events of each day, indexed by date: one Event per slot of the day
days = dict()

# Table 1: a row starts with the date of its first day, followed by the short
# texts of DAYS_PER_WEEK consecutive days, one cell per slot
slotsPerDay = len(SLOTS)
for line in tables[0]:
    try:
        date = datetime.datetime.strptime(line[TABLE_1_DATE_X], DATE_FORMAT)
    except (ValueError, TypeError):
        # Rows which do not hold a date have nothing to grab
        continue

    for weekSlot in range(int(DAYS_PER_WEEK * slotsPerDay)):
        # Position of the cell in the week: which day, which slot of that day
        dayOffset, daySlot = divmod(weekSlot, slotsPerDay)
        slotDate = date + datetime.timedelta(days=dayOffset)

        if slotDate not in days:
            days[slotDate] = [Event() for _ in SLOTS]

        days[slotDate][daySlot].feedShortText(line[TABLE_1_FIRST_SLOT_X + weekSlot])

# Parsing table 2: a row holds the date of a single day followed by the long
# text of each of its slots
for line in tables[1]:
    try:
        date = datetime.datetime.strptime(line[TABLE_2_DATE_X], DATE_FORMAT)
    except (ValueError, TypeError):
        # This is not a date, no data to grab here. TypeError has to be
        # caught as well (like for table 1): positions of the expanded table
        # which no cell covers hold False instead of a string.
        continue

    if date not in days:
        days[date] = [Event() for _ in SLOTS]

    for slot, event in enumerate(days[date]):
        event.feedLongText(line[slot + TABLE_2_FIRST_SLOT_X])

# Merging the consecutive slots of a day which hold the same text into a
# single event, and giving each resulting event its date and slots
events = []
for day, dayEvents in days.items():
    prevEvent = False
    for slot, event in enumerate(dayEvents):
        if prevEvent:
            # The long texts are compared when the running event has one,
            # the short texts otherwise
            if prevEvent.longText:
                sameAsPrevious = event.longText == prevEvent.longText
            else:
                sameAsPrevious = event.shortText == prevEvent.shortText

            if sameAsPrevious:
                # Same event going on: it now lasts until this slot
                prevEvent.feedEndSlot(slot)
                continue

            # The running event is over
            prevEvent.endFeed()
            events.append(prevEvent)

        # This slot starts a new running event
        event.feedDate(day)
        event.feedBegSlot(slot)
        event.feedEndSlot(slot)
        prevEvent = event

    # The last running event of the day is still to be closed
    prevEvent.endFeed()
    events.append(prevEvent)

# Creating calendar
cal = Calendar()
# PRODID is the product identifier property of an iCalendar object
# (the property name used to be misspelled 'proid')
cal.add('prodid', '-//geoffrey.frogeye.fr//NONSGML Icalendar Calendar//EN')
cal.add('version', '2.0')
cal.add('calscale', 'GREGORIAN')
cal.add('x-wr-calname', 'Polytech IMA ' + str(args.annee) + ' ' + args.edt)

# Only the events which were given a short text are exported
for event in events:
    if event.active:
        cal.add_component(event.getEvent())

# Writing calendar to file
data = cal.to_ical()
if args.file == '-':
    # The serialized calendar is bytes: write it as is on the binary layer of
    # stdout, so that the output does not depend on the console encoding nor
    # on newline translation.
    binaryStdout = getattr(sys.stdout, 'buffer', None)
    if binaryStdout is not None:
        binaryStdout.write(data)
    else:
        # stdout was replaced by a text-only stream
        sys.stdout.write(data.decode('utf-8'))
else:
    with open(args.file, 'wb') as f:
        f.write(data)