Merge branch 'feature/background-scrape' into 'dev'

Feature: Background Scrape

See merge request s3lph/icalendar-timeseries-server!1
This commit is contained in:
s3lph 2019-08-21 12:25:06 +00:00
commit 326ecf2a73
8 changed files with 90 additions and 48 deletions

View file

@ -92,7 +92,6 @@ Configuration is done through a JSON config file:
"port": 8090,
"start_delta": "-PT3H",
"end_delta": "P30D",
"cache": "PT15M",
"tz": "Europe/Zurich",
"calendars": {
"private": {
@ -104,6 +103,7 @@ Configuration is done through a JSON config file:
}
},
"public": {
"interval": "P1D",
"url": "https://example.cloud/dav/me/public.ics"
},
"confidential": {
@ -136,11 +136,11 @@ Configuration is done through a JSON config file:
| `port` | int | The port to listen on. |
| `start_delta` | string | A signed ISO 8601 duration string, describing the event range start offset relative to the current time. |
| `end_delta` | string | An unsigned ISO 8601 duration string, describing the event range end offset relative to the current time. |
| `cache` | string | An unsigned ISO 8601 duration string, describing the cache timeout duration. |
| `tz` | string | The local timezone. |
| `calendars` | dict | The calendars to scrape. |
| `keys(calendars)` | string | Name of the calendar. |
| `calendars.*.url` | string | The HTTP or HTTPS URL to scrape. |
| `calendars.*.interval` | string | An unsigned ISO 8601 duration string, describing the scrape interval for this calendar. |
| `calendars.*.ca` | string | Path to the CA certificate file to validate the server's TLS certificate against, in PEM format (optional). |
| `calendars.*.auth` | dict | Authorization config for the calendar. |
| `calendars.*.auth[].type` | string | Authorization type, one of `none` (no authorization), `basic` (HTTP Basic Authentication), `tls` (TLS client certificate). |

View file

@ -7,23 +7,26 @@
"tz": "Europe/Zurich",
"calendars": {
"tlstest": {
"interval": "PT5M",
"url": "https://localhost/private.ics",
"ca": "/home/sebastian/tlstest/ca/ca/ca.crt",
"auth": {
"type": "tls",
"keyfile": "/home/sebastian/tlstest/client/combined.pem"
}
},
"filetest": {
"interval": "PT1M",
"url": "file:///srv/http/private.ics"
}
},
"key_replace": {
"summary": "a_summary",
"description": "b_description",
"calendar": "c_calendar"
"description": "b_description"
},
"value_replace": {
"summary": "{{ summary|truncate(100, end=' \\N{HORIZONTAL ELLIPSIS}') }}",
"description": "{{ description|truncate(100, end=' \\N{HORIZONTAL ELLIPSIS}') }}",
"calendar": "{{ 0 if calendar == 'private' else 1 }}",
"useless_metric": "{{ start.timestamp() + end.timestamp() }}"
}
}

View file

@ -1,29 +1,21 @@
from typing import List
import json
from datetime import datetime
from urllib.error import HTTPError
import traceback
import bottle
from isodate import Duration
from icalendar_timeseries_server.config import get_config
from icalendar_timeseries_server.event import Event
from icalendar_timeseries_server.cal import scrape_calendar
from icalendar_timeseries_server.event import Metric
from icalendar_timeseries_server.cal import get_calendar
from icalendar_timeseries_server.query import MetricQuery
@bottle.route('/api/v1/query')
@bottle.route('/api/v1/query_range')
def prometheus_api():
tz = get_config().tz
now: datetime = datetime.now(tz)
start_delta: Duration = get_config().start_delta
end_delta: Duration = get_config().end_delta
start: datetime = now + start_delta
end: datetime = now + end_delta
events: List[Event] = []
events: List[Metric] = []
try:
q = MetricQuery(bottle.request.query['query'])
@ -39,8 +31,8 @@ def prometheus_api():
return json.dumps(response)
try:
for name, caldef in get_config().calendars.items():
events.extend(scrape_calendar(name, caldef, start, end))
for name in get_config().calendars.keys():
events.extend(get_calendar(name))
events = list(filter(q, events))
events.sort(key=lambda e: e.start)
response = {

View file

@ -1,18 +1,21 @@
from typing import Dict, List, Iterable, Tuple
from typing import Dict, List, Iterable
import sys
import urllib.request
from datetime import datetime, date, timedelta
from threading import Lock, Timer
from dateutil import rrule
from icalendar import cal
from isodate import Duration
from icalendar_timeseries_server import __version__
from icalendar_timeseries_server.config import get_config, CalendarConfig
from icalendar_timeseries_server.event import Event
_SCRAPE_CACHE: Dict[str, Tuple[datetime, List[Event]]] = dict()
_SCRAPE_CACHE: Dict[str, List[Event]] = dict()
_SCRAPE_CACHE_LOCK: Lock = Lock()
__py_version: str = f'{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}'
USER_AGENT: str = f'icalendar-timeseries-server/{__version__} (Python/{__py_version})'
@ -46,8 +49,15 @@ def _parse_recurring(event: cal.Event, start: datetime, end: datetime, duration:
return occurences
def _parse_calendar(name: str, calendar: cal.Calendar, start: datetime, end: datetime) -> List[Event]:
def _scrape_calendar(name: str, config: CalendarConfig, start: datetime, end: datetime):
global _SCRAPE_CACHE, _SCRAPE_CACHE_LOCK
events = []
opener: urllib.request.OpenerDirector = config.get_url_opener()
with opener.open(config.url) as response:
data = response.read().decode('utf-8')
calendar = cal.Calendar.from_ical(data)
for element in calendar.walk():
if element.name == "VEVENT":
dtstart = element.get('dtstart').dt
@ -70,23 +80,34 @@ def _parse_calendar(name: str, calendar: cal.Calendar, start: datetime, end: dat
for occurence in occurences:
if start <= occurence < end:
events.append(Event(name, element, occurence, occurence + duration))
return events
with _SCRAPE_CACHE_LOCK:
_SCRAPE_CACHE[name] = events
def scrape_calendar(name: str, config: CalendarConfig, start: datetime, end: datetime) -> List[Event]:
def scrape_calendar(name: str, config: CalendarConfig):
# Get current time in configured timezone
tz = get_config().tz
now: datetime = datetime.now(tz)
# Reschedule calendar scraping
cron = Timer(config.interval.totimedelta(start=now).total_seconds(),
lambda: scrape_calendar(name, config))
cron.start()
# Compute interval for which to return events
start_delta: Duration = get_config().start_delta
end_delta: Duration = get_config().end_delta
start: datetime = now + start_delta
end: datetime = now + end_delta
# Scrape and parse the calendar
_scrape_calendar(name, config, start, end)
def start_scrape_calendar(name: str, config: CalendarConfig):
# Schedule first calendar scraping
cron = Timer(0, lambda: scrape_calendar(name, config))
cron.start()
def get_calendar(name: str):
global _SCRAPE_CACHE
now: datetime = datetime.now(tz=get_config().tz)
if get_config().cache.total_seconds() > 0 and name in _SCRAPE_CACHE:
cache_timeout, cached = _SCRAPE_CACHE[name]
if now < cache_timeout:
print('serving cached')
return cached
print('doing request')
opener: urllib.request.OpenerDirector = config.get_url_opener()
with opener.open(config.url) as response:
data = response.read().decode('utf-8')
calendar = cal.Calendar.from_ical(data)
parsed: List[Event] = _parse_calendar(name, calendar, start, end)
_SCRAPE_CACHE[name] = now + get_config().cache, parsed
return parsed
with _SCRAPE_CACHE_LOCK:
return _SCRAPE_CACHE.get(name, [])

View file

@ -27,6 +27,7 @@ class CalendarConfig:
def __init__(self, config: Dict[str, Any], config_path: str) -> None:
self._url: str = _keycheck('url', config, str, config_path)
self._scrape_interval: Duration = _parse_timedelta('interval', config, config_path, default_value='PT15M')
self._ca: Optional[str] = _keycheck('ca', config, str, config_path, optional=True)
auth: Dict[str, Any] = _keycheck('auth', config, dict, config_path, default_value={'type': 'none'})
self._authtype: str = _keycheck('type', auth, str, f'{config_path}.auth',
@ -56,6 +57,10 @@ class CalendarConfig:
def url(self) -> str:
return self._url
@property
def interval(self) -> Duration:
return self._scrape_interval
def get_url_opener(self) -> urllib.request.OpenerDirector:
if self._authtype == 'tls':
@ -89,7 +94,6 @@ class Config:
self._tz: pytz.tzinfo = _parse_timezone('tz', config, '', default_value='UTC')
self._start_delta: Duration = _parse_timedelta('start_delta', config, '', default_value='PT')
self._end_delta: Duration = _parse_timedelta('end_delta', config, '', default_value='P30D')
self._cache: Duration = _parse_timedelta('cache', config, '', default_value='PT', force_positive=True)
self._calendars: Dict[str, CalendarConfig] = self._parse_calendars_config('calendars', config, '')
self._key_replace = _parse_key_replace('key_replace', config, '')
self._value_replace = _parse_value_replace('value_replace', config, '')
@ -125,10 +129,6 @@ class Config:
def end_delta(self) -> Duration:
return self._end_delta
@property
def cache(self) -> Duration:
return self._cache
@property
def calendars(self) -> Dict[str, CalendarConfig]:
return self._calendars

View file

@ -38,7 +38,7 @@ class Event(Metric):
for attr in _ATTRIBUTES:
tmp[attr] = event.get(attr, '')
substitution_keys = set(_ATTRIBUTES)
substitution_keys.update(['start', 'end'])
substitution_keys.update(tmp.keys())
substitution_keys.update(get_config().key_replace.keys())
substitution_keys.update(get_config().value_replace.keys())
for attr in substitution_keys:

View file

@ -2,6 +2,7 @@ import sys
import bottle
from icalendar_timeseries_server.cal import start_scrape_calendar
from icalendar_timeseries_server.config import load_config, load_default_config, get_config
# Contains decorated bottle handler function for /api/v1/query
@ -17,7 +18,12 @@ def main():
else:
print(f'Can only read one config file, got "{" ".join(sys.argv[1:])}"')
exit(1)
bottle.run(host=get_config().addr, port=get_config().port)
config = get_config()
# Schedule calendar scraping in the background
for calname in config.calendars.keys():
start_scrape_calendar(calname, config.calendars[calname])
# Start the Bottle HTTP server
bottle.run(host=config.addr, port=get_config().port)
if __name__ == '__main__':

View file

@ -16,7 +16,6 @@ _CONFIG_VALID = """
"port": 8090,
"start_delta": "-PT3H",
"end_delta": "P30D",
"cache": "PT15M",
"tz": "Europe/Zurich",
"calendars": {
"private": {
@ -28,10 +27,12 @@ _CONFIG_VALID = """
}
},
"public": {
"url": "https://example.cloud/dav/me/public.ics"
"url": "https://example.cloud/dav/me/public.ics",
"interval": "P1D"
},
"confidential": {
"url": "https://example.cloud/dav/me/confidential.ics",
"interval": "PT5M",
"ca": "/etc/ssl/ca.pem",
"auth": {
"type": "tls",
@ -124,5 +125,24 @@ class ConfigTest(unittest.TestCase):
self.assertEqual(config.port, 8090)
self.assertEqual(config.start_delta, Duration(hours=-3))
self.assertEqual(config.end_delta, Duration(days=30))
self.assertEqual(config.cache, Duration(minutes=15))
self.assertEqual(config.tz, pytz.timezone('Europe/Zurich'))
def test_parse_calendars(self):
config = Config(json.loads(_CONFIG_VALID))
self.assertEqual({'public', 'private', 'confidential'}, config.calendars.keys())
self.assertEqual('https://example.cloud/dav/me/public.ics', config.calendars['public'].url)
self.assertEqual(Duration(days=1), config.calendars['public'].interval)
self.assertEqual('none', config.calendars['public']._authtype)
self.assertEqual('https://example.cloud/dav/me/private.ics', config.calendars['private'].url)
self.assertEqual(Duration(minutes=15), config.calendars['private'].interval)
self.assertEqual('basic', config.calendars['private']._authtype)
self.assertEqual('Basic bWU6bXlzdXBlcnNlY3VyZXBhc3N3b3Jk',
config.calendars['private']._request_headers['Authorization'])
self.assertEqual('https://example.cloud/dav/me/confidential.ics', config.calendars['confidential'].url)
self.assertEqual(Duration(minutes=5), config.calendars['confidential'].interval)
self.assertEqual('tls', config.calendars['confidential']._authtype)
self.assertEqual('/etc/ssl/client.pem', config.calendars['confidential']._tls_keyfile)
self.assertEqual('mysupersecurepassword', config.calendars['confidential']._tls_passphrase)