Thunderbird mbox to Influxdb and Postgresql to Grafana in Python

J’ai donc fait un programme en python afin de faire un export des données de Thunderbird ( fichier mbox contenant les emails) vers Influxdb et Postgresql (dans un premier temps).

Le but du programme est de maitriser le python afin de faire ensuite des exports vers Grafana. Sur mes exemples j’ai mis en login/password arias/arias (je sais c’est pas secure mais c’est un exemple …).

Configuration GRAFANA:

Le programme :

( Source disponible ici : https://github.com/farias06/Python/blob/master/parse_email.py )

#! /usr/bin/env python3
# ~*~ utf-8 ~*~

# Readme :
# influx user create -n arias -p arias -o cyberneurones-org 

import mailbox
import bs4
import glob
import os
import time
import codecs
import sys
from influxdb import InfluxDBClient
import re
from datetime import datetime
from email.utils import parsedate_to_datetime
import logging
import psycopg2

logger = logging.Logger('catch_all')

#########################

global nb_folder
nb_folder = 0;
global nb_email
nb_email = 0;
global nb_error
nb_error = 0;
global id_email
id_email = 0;

global client
client = InfluxDBClient(host='127.0.0.1', port=8086, username='arias', password='arias')
client.drop_database('thunderbird')
client.create_database('thunderbird')
client.switch_database('thunderbird')

global client2
client2   = psycopg2.connect("dbname=thunderbird user=arias password='arias'")
cursor                = client2.cursor()
global name_Table
name_Table            = "thunderbird"
#sqlCreateTable = "create table "+name_Table+" (id bigint UNIQUE, mail varchar(128), name varchar(128), domain varchar (128), date timestamp);"
sqlCreateTable = "delete from "+name_Table;
cursor.execute(sqlCreateTable)
client2.commit()

#########################

def get_html_text(html):
    try:
        return bs4.BeautifulSoup(html, 'lxml').body.get_text(' ', strip=True)
    except AttributeError: # message contents empty
        return None

class GmailMboxMessage():
    def __init__(self, email_data):
        if not isinstance(email_data, mailbox.mboxMessage):
            raise TypeError('Variable must be type mailbox.mboxMessage')
        self.email_data = email_data

    def parse_email(self):
        global client
        global client2
        global id_email
        global name_Table
        global cursor
        email_date = self.email_data['Date']
        email_from = self.email_data['From']
        email_to = self.email_data['To']
        email_subject = self.email_data['Subject']
        if email_date is not None and email_from is not None:
            mail = re.search(r'[\w\.\-_]+@[\w\.\-_]+', email_from)
            if mail is not None:
                mailstr = mail.group(0)
            if mail is not None:    
                domain = re.search("@[\w\.\-_]+", email_from).group(0)
                domain = domain.replace('@', '')
                domain = domain.replace('>', '')
            if mail is not None:
                user = re.search("[\w\.i\-_]+@", email_from).group(0)
                user = user.replace('@', '')
                user = user.replace('<', '')
            local_time_str = datetime.fromtimestamp(parsedate_to_datetime(email_date).timestamp()).strftime('%Y-%m-%dT%H:%M:%S.%f%z')
            local_time_str2 = datetime.fromtimestamp(parsedate_to_datetime(email_date).timestamp()).strftime('%Y-%m-%d %H:%M:%S')
            timestamp = round(parsedate_to_datetime(email_date).timestamp() * 1000);
            if mail is not None:
                data = [{'measurement': 'thunderbirds', 'tags': { 'fullemail': 1, 'from': email_from, 'mail': mailstr, 'domain': domain, 'user': user}, 'id' : id_email, 'time': timestamp, 'date':local_time_str, 'fields': {"value": 1}}]
                sql = "INSERT INTO "+name_Table+" (mail, domain, name, id, date) VALUES ('" +mailstr+ "','" + domain+"','" +user+"','"+str(id_email)+"',TIMESTAMP '"+local_time_str2+"')";
                #print (sql)
                cursor.execute(sql);
                client2.commit();
            else :
               data = [{'measurement': 'thunderbirds', 'tags': { 'fullemail': 0, 'from': email_from }, 'id' : id_email, 'time': timestamp, 'date':local_time_str, 'fields': {"value": 1}}] 
            #print (data);
            client.write_points(data, time_precision='ms')
            id_email = id_email+1

def mbox_reader(stream):
    data = stream.read()
    text = data.decode(encoding="utf-8")
    return mailbox.mboxMessage(text)

######################### End of library, example of use below

print("\nUsing glob.iglob()") 
for filename in glob.iglob('/home/ZZZZZZZ/snap/thunderbird/common/.thunderbird/ZZZZZZZZZ.default/Mail/Local Folders/**/*', recursive=True): 
    print(filename);
    filename2, file_extension = os.path.splitext(filename);
    print(file_extension + " " + str(len(file_extension)));
    isFile = os.path.isfile(filename)
    if (file_extension != ".msf") and (file_extension != ".sbd") and isFile is True:
        mbox_obj = mailbox.mbox(filename);
        num_entries = len(mbox_obj)
        nb_folder = nb_folder + 1;
        try :
           for idx, email_obj in enumerate(mbox_obj):
               email_data = GmailMboxMessage(email_obj)
               email_data.parse_email()
               nb_email = nb_email + 1;
               print('Parsing email {0} of {1}'.format(idx, num_entries))
        except StopIteration:
           continue
        except Exception as e: 
           logger.error('Failed : '+ str(e))
           nb_error = nb_error+1;
           continue

print('The number of folder :'+str(nb_folder));    
print('The number of email :'+str(nb_email));
print('The number of error : '+str(nb_error));
print('The number in database : '+str(id_email));

L’execution du programme:

...
The number of folder :1321
The number of email :120384
The number of error : 125
The number in database : 113523

Oui j’ai plus de 113523 emails ( environ 30 Go) … misère. Pour information seul les softs open source sont stables avec autant d’email, sous Outlook c’est le crash.

Le résultat :