Merge branch 'feature/background-scrape' into 'dev'

Feature: Background Scrape

See merge request s3lph/icalendar-timeseries-server!1
This commit is contained in:
s3lph 2019-08-21 12:25:06 +00:00
commit 326ecf2a73
8 changed files with 90 additions and 48 deletions

View file

@ -92,7 +92,6 @@ Configuration is done through a JSON config file:
"port": 8090, "port": 8090,
"start_delta": "-PT3H", "start_delta": "-PT3H",
"end_delta": "P30D", "end_delta": "P30D",
"cache": "PT15M",
"tz": "Europe/Zurich", "tz": "Europe/Zurich",
"calendars": { "calendars": {
"private": { "private": {
@ -104,6 +103,7 @@ Configuration is done through a JSON config file:
} }
}, },
"public": { "public": {
"interval": "P1D",
"url": "https://example.cloud/dav/me/public.ics" "url": "https://example.cloud/dav/me/public.ics"
}, },
"confidential": { "confidential": {
@ -136,11 +136,11 @@ Configuration is done through a JSON config file:
| `port` | int | The port to listen on. | | `port` | int | The port to listen on. |
| `start_delta` | string | A signed ISO 8601 duration string, describing the event range start offset relative to the current time. | | `start_delta` | string | A signed ISO 8601 duration string, describing the event range start offset relative to the current time. |
| `end_delta` | string | An unsigned ISO 8601 duration string, describing the event range end offset relative to the current time. | | `end_delta` | string | An unsigned ISO 8601 duration string, describing the event range end offset relative to the current time. |
| `cache` | string | An unsigned ISO 8601 duration string, describing the cache timeout duration. |
| `tz` | string | The local timezone. | | `tz` | string | The local timezone. |
| `calendars` | dict | The calendars to scrape. | | `calendars` | dict | The calendars to scrape. |
| `keys(calendars)` | string | Name of the calendar. | | `keys(calendars)` | string | Name of the calendar. |
| `calendars.*.url` | string | The HTTP or HTTPS URL to scrape. | | `calendars.*.url` | string | The HTTP or HTTPS URL to scrape. |
| `calendars.*.interval` | string | An unsigned ISO 8601 duration string, describing the scrape interval for this calendar. |
| `calendars.*.ca` | string | Path to the CA certificate file to validate the server's TLS certificate against, in PEM format (optional). | | `calendars.*.ca` | string | Path to the CA certificate file to validate the server's TLS certificate against, in PEM format (optional). |
| `calendars.*.auth` | dict | Authorization config for the calendar. | | `calendars.*.auth` | dict | Authorization config for the calendar. |
| `calendars.*.auth[].type` | string | Authorization type, one of `none` (no authorization), `basic` (HTTP Basic Authentication), `tls` (TLS client certificate). | | `calendars.*.auth[].type` | string | Authorization type, one of `none` (no authorization), `basic` (HTTP Basic Authentication), `tls` (TLS client certificate). |

View file

@ -7,23 +7,26 @@
"tz": "Europe/Zurich", "tz": "Europe/Zurich",
"calendars": { "calendars": {
"tlstest": { "tlstest": {
"interval": "PT5M",
"url": "https://localhost/private.ics", "url": "https://localhost/private.ics",
"ca": "/home/sebastian/tlstest/ca/ca/ca.crt", "ca": "/home/sebastian/tlstest/ca/ca/ca.crt",
"auth": { "auth": {
"type": "tls", "type": "tls",
"keyfile": "/home/sebastian/tlstest/client/combined.pem" "keyfile": "/home/sebastian/tlstest/client/combined.pem"
} }
},
"filetest": {
"interval": "PT1M",
"url": "file:///srv/http/private.ics"
} }
}, },
"key_replace": { "key_replace": {
"summary": "a_summary", "summary": "a_summary",
"description": "b_description", "description": "b_description"
"calendar": "c_calendar"
}, },
"value_replace": { "value_replace": {
"summary": "{{ summary|truncate(100, end=' \\N{HORIZONTAL ELLIPSIS}') }}", "summary": "{{ summary|truncate(100, end=' \\N{HORIZONTAL ELLIPSIS}') }}",
"description": "{{ description|truncate(100, end=' \\N{HORIZONTAL ELLIPSIS}') }}", "description": "{{ description|truncate(100, end=' \\N{HORIZONTAL ELLIPSIS}') }}",
"calendar": "{{ 0 if calendar == 'private' else 1 }}",
"useless_metric": "{{ start.timestamp() + end.timestamp() }}" "useless_metric": "{{ start.timestamp() + end.timestamp() }}"
} }
} }

View file

@ -1,29 +1,21 @@
from typing import List from typing import List
import json import json
from datetime import datetime
from urllib.error import HTTPError from urllib.error import HTTPError
import traceback import traceback
import bottle import bottle
from isodate import Duration
from icalendar_timeseries_server.config import get_config from icalendar_timeseries_server.config import get_config
from icalendar_timeseries_server.event import Event from icalendar_timeseries_server.event import Metric
from icalendar_timeseries_server.cal import scrape_calendar from icalendar_timeseries_server.cal import get_calendar
from icalendar_timeseries_server.query import MetricQuery from icalendar_timeseries_server.query import MetricQuery
@bottle.route('/api/v1/query') @bottle.route('/api/v1/query')
@bottle.route('/api/v1/query_range') @bottle.route('/api/v1/query_range')
def prometheus_api(): def prometheus_api():
tz = get_config().tz events: List[Metric] = []
now: datetime = datetime.now(tz)
start_delta: Duration = get_config().start_delta
end_delta: Duration = get_config().end_delta
start: datetime = now + start_delta
end: datetime = now + end_delta
events: List[Event] = []
try: try:
q = MetricQuery(bottle.request.query['query']) q = MetricQuery(bottle.request.query['query'])
@ -39,8 +31,8 @@ def prometheus_api():
return json.dumps(response) return json.dumps(response)
try: try:
for name, caldef in get_config().calendars.items(): for name in get_config().calendars.keys():
events.extend(scrape_calendar(name, caldef, start, end)) events.extend(get_calendar(name))
events = list(filter(q, events)) events = list(filter(q, events))
events.sort(key=lambda e: e.start) events.sort(key=lambda e: e.start)
response = { response = {

View file

@ -1,18 +1,21 @@
from typing import Dict, List, Iterable, Tuple from typing import Dict, List, Iterable
import sys import sys
import urllib.request import urllib.request
from datetime import datetime, date, timedelta from datetime import datetime, date, timedelta
from threading import Lock, Timer
from dateutil import rrule from dateutil import rrule
from icalendar import cal from icalendar import cal
from isodate import Duration
from icalendar_timeseries_server import __version__ from icalendar_timeseries_server import __version__
from icalendar_timeseries_server.config import get_config, CalendarConfig from icalendar_timeseries_server.config import get_config, CalendarConfig
from icalendar_timeseries_server.event import Event from icalendar_timeseries_server.event import Event
_SCRAPE_CACHE: Dict[str, Tuple[datetime, List[Event]]] = dict() _SCRAPE_CACHE: Dict[str, List[Event]] = dict()
_SCRAPE_CACHE_LOCK: Lock = Lock()
__py_version: str = f'{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}' __py_version: str = f'{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}'
USER_AGENT: str = f'icalendar-timeseries-server/{__version__} (Python/{__py_version})' USER_AGENT: str = f'icalendar-timeseries-server/{__version__} (Python/{__py_version})'
@ -46,8 +49,15 @@ def _parse_recurring(event: cal.Event, start: datetime, end: datetime, duration:
return occurences return occurences
def _parse_calendar(name: str, calendar: cal.Calendar, start: datetime, end: datetime) -> List[Event]: def _scrape_calendar(name: str, config: CalendarConfig, start: datetime, end: datetime):
global _SCRAPE_CACHE, _SCRAPE_CACHE_LOCK
events = [] events = []
opener: urllib.request.OpenerDirector = config.get_url_opener()
with opener.open(config.url) as response:
data = response.read().decode('utf-8')
calendar = cal.Calendar.from_ical(data)
for element in calendar.walk(): for element in calendar.walk():
if element.name == "VEVENT": if element.name == "VEVENT":
dtstart = element.get('dtstart').dt dtstart = element.get('dtstart').dt
@ -70,23 +80,34 @@ def _parse_calendar(name: str, calendar: cal.Calendar, start: datetime, end: dat
for occurence in occurences: for occurence in occurences:
if start <= occurence < end: if start <= occurence < end:
events.append(Event(name, element, occurence, occurence + duration)) events.append(Event(name, element, occurence, occurence + duration))
return events with _SCRAPE_CACHE_LOCK:
_SCRAPE_CACHE[name] = events
def scrape_calendar(name: str, config: CalendarConfig, start: datetime, end: datetime) -> List[Event]: def scrape_calendar(name: str, config: CalendarConfig):
# Get current time in configured timezone
tz = get_config().tz
now: datetime = datetime.now(tz)
# Reschedule calendar scraping
cron = Timer(config.interval.totimedelta(start=now).total_seconds(),
lambda: scrape_calendar(name, config))
cron.start()
# Compute interval for which to return events
start_delta: Duration = get_config().start_delta
end_delta: Duration = get_config().end_delta
start: datetime = now + start_delta
end: datetime = now + end_delta
# Scrape and parse the calendar
_scrape_calendar(name, config, start, end)
def start_scrape_calendar(name: str, config: CalendarConfig):
# Schedule first calendar scraping
cron = Timer(0, lambda: scrape_calendar(name, config))
cron.start()
def get_calendar(name: str):
global _SCRAPE_CACHE global _SCRAPE_CACHE
now: datetime = datetime.now(tz=get_config().tz) with _SCRAPE_CACHE_LOCK:
if get_config().cache.total_seconds() > 0 and name in _SCRAPE_CACHE: return _SCRAPE_CACHE.get(name, [])
cache_timeout, cached = _SCRAPE_CACHE[name]
if now < cache_timeout:
print('serving cached')
return cached
print('doing request')
opener: urllib.request.OpenerDirector = config.get_url_opener()
with opener.open(config.url) as response:
data = response.read().decode('utf-8')
calendar = cal.Calendar.from_ical(data)
parsed: List[Event] = _parse_calendar(name, calendar, start, end)
_SCRAPE_CACHE[name] = now + get_config().cache, parsed
return parsed

View file

@ -27,6 +27,7 @@ class CalendarConfig:
def __init__(self, config: Dict[str, Any], config_path: str) -> None: def __init__(self, config: Dict[str, Any], config_path: str) -> None:
self._url: str = _keycheck('url', config, str, config_path) self._url: str = _keycheck('url', config, str, config_path)
self._scrape_interval: Duration = _parse_timedelta('interval', config, config_path, default_value='PT15M')
self._ca: Optional[str] = _keycheck('ca', config, str, config_path, optional=True) self._ca: Optional[str] = _keycheck('ca', config, str, config_path, optional=True)
auth: Dict[str, Any] = _keycheck('auth', config, dict, config_path, default_value={'type': 'none'}) auth: Dict[str, Any] = _keycheck('auth', config, dict, config_path, default_value={'type': 'none'})
self._authtype: str = _keycheck('type', auth, str, f'{config_path}.auth', self._authtype: str = _keycheck('type', auth, str, f'{config_path}.auth',
@ -56,6 +57,10 @@ class CalendarConfig:
def url(self) -> str: def url(self) -> str:
return self._url return self._url
@property
def interval(self) -> Duration:
return self._scrape_interval
def get_url_opener(self) -> urllib.request.OpenerDirector: def get_url_opener(self) -> urllib.request.OpenerDirector:
if self._authtype == 'tls': if self._authtype == 'tls':
@ -89,7 +94,6 @@ class Config:
self._tz: pytz.tzinfo = _parse_timezone('tz', config, '', default_value='UTC') self._tz: pytz.tzinfo = _parse_timezone('tz', config, '', default_value='UTC')
self._start_delta: Duration = _parse_timedelta('start_delta', config, '', default_value='PT') self._start_delta: Duration = _parse_timedelta('start_delta', config, '', default_value='PT')
self._end_delta: Duration = _parse_timedelta('end_delta', config, '', default_value='P30D') self._end_delta: Duration = _parse_timedelta('end_delta', config, '', default_value='P30D')
self._cache: Duration = _parse_timedelta('cache', config, '', default_value='PT', force_positive=True)
self._calendars: Dict[str, CalendarConfig] = self._parse_calendars_config('calendars', config, '') self._calendars: Dict[str, CalendarConfig] = self._parse_calendars_config('calendars', config, '')
self._key_replace = _parse_key_replace('key_replace', config, '') self._key_replace = _parse_key_replace('key_replace', config, '')
self._value_replace = _parse_value_replace('value_replace', config, '') self._value_replace = _parse_value_replace('value_replace', config, '')
@ -125,10 +129,6 @@ class Config:
def end_delta(self) -> Duration: def end_delta(self) -> Duration:
return self._end_delta return self._end_delta
@property
def cache(self) -> Duration:
return self._cache
@property @property
def calendars(self) -> Dict[str, CalendarConfig]: def calendars(self) -> Dict[str, CalendarConfig]:
return self._calendars return self._calendars

View file

@ -38,7 +38,7 @@ class Event(Metric):
for attr in _ATTRIBUTES: for attr in _ATTRIBUTES:
tmp[attr] = event.get(attr, '') tmp[attr] = event.get(attr, '')
substitution_keys = set(_ATTRIBUTES) substitution_keys = set(_ATTRIBUTES)
substitution_keys.update(['start', 'end']) substitution_keys.update(tmp.keys())
substitution_keys.update(get_config().key_replace.keys()) substitution_keys.update(get_config().key_replace.keys())
substitution_keys.update(get_config().value_replace.keys()) substitution_keys.update(get_config().value_replace.keys())
for attr in substitution_keys: for attr in substitution_keys:

View file

@ -2,6 +2,7 @@ import sys
import bottle import bottle
from icalendar_timeseries_server.cal import start_scrape_calendar
from icalendar_timeseries_server.config import load_config, load_default_config, get_config from icalendar_timeseries_server.config import load_config, load_default_config, get_config
# Contains decorated bottle handler function for /api/v1/query # Contains decorated bottle handler function for /api/v1/query
@ -17,7 +18,12 @@ def main():
else: else:
print(f'Can only read one config file, got "{" ".join(sys.argv[1:])}"') print(f'Can only read one config file, got "{" ".join(sys.argv[1:])}"')
exit(1) exit(1)
bottle.run(host=get_config().addr, port=get_config().port) config = get_config()
# Schedule calendar scraping in the background
for calname in config.calendars.keys():
start_scrape_calendar(calname, config.calendars[calname])
# Start the Bottle HTTP server
bottle.run(host=config.addr, port=get_config().port)
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -16,7 +16,6 @@ _CONFIG_VALID = """
"port": 8090, "port": 8090,
"start_delta": "-PT3H", "start_delta": "-PT3H",
"end_delta": "P30D", "end_delta": "P30D",
"cache": "PT15M",
"tz": "Europe/Zurich", "tz": "Europe/Zurich",
"calendars": { "calendars": {
"private": { "private": {
@ -28,10 +27,12 @@ _CONFIG_VALID = """
} }
}, },
"public": { "public": {
"url": "https://example.cloud/dav/me/public.ics" "url": "https://example.cloud/dav/me/public.ics",
"interval": "P1D"
}, },
"confidential": { "confidential": {
"url": "https://example.cloud/dav/me/confidential.ics", "url": "https://example.cloud/dav/me/confidential.ics",
"interval": "PT5M",
"ca": "/etc/ssl/ca.pem", "ca": "/etc/ssl/ca.pem",
"auth": { "auth": {
"type": "tls", "type": "tls",
@ -124,5 +125,24 @@ class ConfigTest(unittest.TestCase):
self.assertEqual(config.port, 8090) self.assertEqual(config.port, 8090)
self.assertEqual(config.start_delta, Duration(hours=-3)) self.assertEqual(config.start_delta, Duration(hours=-3))
self.assertEqual(config.end_delta, Duration(days=30)) self.assertEqual(config.end_delta, Duration(days=30))
self.assertEqual(config.cache, Duration(minutes=15))
self.assertEqual(config.tz, pytz.timezone('Europe/Zurich')) self.assertEqual(config.tz, pytz.timezone('Europe/Zurich'))
def test_parse_calendars(self):
config = Config(json.loads(_CONFIG_VALID))
self.assertEqual({'public', 'private', 'confidential'}, config.calendars.keys())
self.assertEqual('https://example.cloud/dav/me/public.ics', config.calendars['public'].url)
self.assertEqual(Duration(days=1), config.calendars['public'].interval)
self.assertEqual('none', config.calendars['public']._authtype)
self.assertEqual('https://example.cloud/dav/me/private.ics', config.calendars['private'].url)
self.assertEqual(Duration(minutes=15), config.calendars['private'].interval)
self.assertEqual('basic', config.calendars['private']._authtype)
self.assertEqual('Basic bWU6bXlzdXBlcnNlY3VyZXBhc3N3b3Jk',
config.calendars['private']._request_headers['Authorization'])
self.assertEqual('https://example.cloud/dav/me/confidential.ics', config.calendars['confidential'].url)
self.assertEqual(Duration(minutes=5), config.calendars['confidential'].interval)
self.assertEqual('tls', config.calendars['confidential']._authtype)
self.assertEqual('/etc/ssl/client.pem', config.calendars['confidential']._tls_keyfile)
self.assertEqual('mysupersecurepassword', config.calendars['confidential']._tls_passphrase)