From 3cbd247a71a8f4d4b353f67330cb3974e9d9d109 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20Ma=C5=9Blanka?= <piotr.maslanka@henrietta.com.pl>
Date: Thu, 26 Aug 2021 23:20:25 +0200
Subject: [PATCH] minor fixes and refactors

---
 .gitlab-ci.yml      |  4 +--
 COPYING.md          |  1 +
 NOTES.md            | 66 +++++++++++++++++++++++++++++++++++++++++++--
 README.md           | 11 +++++---
 agent/middleware.py |  7 ++---
 counting/models.py  |  9 +++----
 netguru/settings.py | 48 +++++++++++++++------------------
 shares/models.py    | 14 +++-------
 8 files changed, 107 insertions(+), 53 deletions(-)
 create mode 100644 COPYING.md

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e817dac..8582b7e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -17,14 +17,14 @@ unittest:
 
 build_nginx:
   script:
-    - docker-it nginx zoo.smok.co/henrietta/netguru/nginx deployment/nginx
+    - DOCKERIT_NO_BRANCH=1 docker-it nginx zoo.smok.co/henrietta/netguru/nginx deployment/nginx
   only:
     - master
 
 build:
   stage: build
   script:
-    - docker-it netguru zoo.smok.co/henrietta/netguru/backend . --target runtime
+    - DOCKERIT_NO_BRANCH=1 docker-it netguru zoo.smok.co/henrietta/netguru/backend . --target runtime
 
 deploy:
   stage: deploy
diff --git a/COPYING.md b/COPYING.md
new file mode 100644
index 0000000..dba62d6
--- /dev/null
+++ b/COPYING.md
@@ -0,0 +1 @@
+Copyright (c) 2021 Piotr Maślanka. All rights reserved.
diff --git a/NOTES.md b/NOTES.md
index 381ddca..c83a172 100644
--- a/NOTES.md
+++ b/NOTES.md
@@ -14,7 +14,8 @@ as such:
 ```
 
 Because keying them together by day is going to make the thing
-a lot harder for the frontend programmer to parse.
+harder for the frontend programmer to parse, and I suppose they
+need the backend to be as accommodating as possible.
 
 I know that I'm ignoring the specification, and you are free to call me out
 on that - but as I got some experience with frontend, 
@@ -34,6 +35,9 @@ to apply them trumps even that.
 Submissions are to follow `/api/add`, because DRF happened to generate
 nice documentation there and not for Swagger.
 
+Note that the Swagger endpoints are available only in DEBUG mode, 
+so they are unavailable in production.
+
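+Roughly, the gating in `urls.py` looks like this (a sketch - the exact wiring
+may differ, and django-rest-swagger's `get_swagger_view` is an assumption here):
+
+```python
+from django.conf import settings
+from django.urls import path
+from rest_framework_swagger.views import get_swagger_view
+
+urlpatterns = []  # the real list lives in netguru/urls.py
+
+if settings.DEBUG:
+    # mounted only in DEBUG, hence absent in production
+    urlpatterns.append(path('swagger/', get_swagger_view(title='Netguru API')))
+```
+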
 ## Authorization
 
 You can authorize your API requests either via Django cookies
@@ -50,10 +54,68 @@ I realize that it would be best practice to deduplicate some code contained
 within tests, but since I'm running at about 1.5 full-time equivalents you just have to forgive me
 for sparing the effort to do so. Thanks "from the mountain!"
 
-# The reaper job
+# The [reaper job](counting/cron.py)
 
 I came up with the reaper job trying to think up a reasonable solution
 that wouldn't load the processing server too much. It processes way-past-due
 links into a nice compact representation and stores them in the database.
 The last two days (since the job fires only every 24 hours) are computed on-the-fly,
 and cached as necessary, for up to 5 minutes.
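+
+A minimal sketch of the shape of such a job (django-cron style, matching the
+`CRON_CLASSES` entry in settings; `count_visits_before()` is a hypothetical
+stand-in for the real aggregation query):
+
+```python
+from datetime import date, timedelta
+
+from django_cron import CronJobBase, Schedule
+
+from counting.models import StoryOfADay
+
+
+class ReaperJobSketch(CronJobBase):
+    schedule = Schedule(run_every_mins=24 * 60)  # fires once every 24 hours
+    code = 'counting.reaper_job_sketch'          # unique django-cron identifier
+
+    def do(self):
+        cutoff = date.today() - timedelta(days=1)
+        # fold every way-past-due day into one compact StoryOfADay row;
+        # count_visits_before() would yield (day, links, files) tuples
+        for day, links, files in count_visits_before(cutoff):
+            StoryOfADay.objects.update_or_create(
+                day=day, defaults={'links': links, 'files': files})
+```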
+
+# Nginx serving static content
+
+Since it's not ideal for Django to be serving large static files, 
+I tried to optimize it as much as possible by using a streaming iterator.
+
+However, in real life I'd think of some authentication mechanism
+to be handled purely by nginx (perhaps a short-lived JWT?),
+and simply let it serve both the static files and the uploaded files.
+
+Right now I'm serving them from the same Django instance, but keep
+in mind that I fully realize that's not optimal. Perhaps this looks
+like a use case for a Kubernetes pod (Django+nginx running in the same pod?).
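+
+For reference, the streaming approach boils down to something like this
+(a sketch, not the exact view from this repo; the `uuid_name` routing is assumed):
+
+```python
+import os
+
+from django.http import FileResponse
+
+FILE_SAVE_LOCATION = '/data'  # mirrors the setting in netguru/settings.py
+
+
+def serve_upload(request, uuid_name: str) -> FileResponse:
+    # FileResponse wraps the file handle in a chunked iterator, so the
+    # whole payload is never loaded into memory at once
+    path = os.path.join(FILE_SAVE_LOCATION, uuid_name)
+    return FileResponse(open(path, 'rb'), as_attachment=True)
+```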
+
+# Checking for [existing UUIDs](shares/models.py#L140)
+
+I know that UUID collisions practically don't happen, but since it's so cheap for us
+to go that extra mile,
+and since we're at risk of overwriting somebody's files, I guess I should check it.
+
+I realize that it's susceptible to race conditions, but then again, what are the odds?
+The chance that a UUID will repeat during the 24 hours a link is active is much higher
+than the chance that it will repeat during the same 2-second window in which
+both users upload their files. If we had to plan for this
+amazing contingency, it would require strong locking guarantees.
+
+Since the solution is written as a single process, however, it's actually
+quite simple to implement. Still, the chance that I get struck by lightning
+seven times over is much higher than the chance of this happening.
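+
+For completeness, a truly race-free variant would reserve the name atomically
+at the filesystem level (a sketch of that locking contingency, not what the
+repo does):
+
+```python
+import os
+import uuid
+
+FILE_SAVE_LOCATION = '/data'  # mirrors the setting in netguru/settings.py
+
+
+def reserve_unique_name() -> str:
+    while True:
+        name = uuid.uuid4().hex
+        try:
+            # O_CREAT | O_EXCL fails atomically if the name is taken,
+            # closing the check-then-create race window entirely
+            fd = os.open(os.path.join(FILE_SAVE_LOCATION, name),
+                         os.O_CREAT | os.O_EXCL | os.O_WRONLY)
+        except FileExistsError:
+            continue  # astronomically unlikely, but handled anyway
+        os.close(fd)
+        return name
+```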
+
+# Processes vs threads
+
+I've written the solution as a single process, since it binds port 81
+to export metrics. I realize I could export them as part of the normal Django
+flow, and my library certainly allows for that. It's mainly a question of preference.
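+
+To illustrate why the process count matters (using the generic
+`prometheus_client` API here, not satella, which is what this repo actually uses):
+
+```python
+from prometheus_client import start_http_server
+
+# a single process means exactly one exporter binds the port; with N worker
+# processes, N exporters would race for the same bind and N-1 would fail
+start_http_server(81)
+```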
+
+I would scale this solution by spawning additional replicas and gathering metrics
+from each one of them, adding some service discovery mechanism for Prometheus
+to pick them up.
+
+# Checking for file [existence before deletion](shares/models.py#L127)
+
+Since we use ATOMIC_REQUESTS, some files might be deleted while the respective records
+are still around. Linux doesn't have distributed transactional filesystem
+functionality (at least none production-ready that I'm aware of), and requiring the sysadmin
+to provide us with a transactional FS would be overkill.
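+
+In other words, the unlink is not covered by the database transaction, so the
+delete has to tolerate an already-missing file. The stdlib equivalent of
+satella's `silence_excs(FileNotFoundError)` would be:
+
+```python
+import contextlib
+import os
+
+# the file may already be gone if a previous delete unlinked it but the
+# surrounding transaction was then rolled back
+with contextlib.suppress(FileNotFoundError):
+    os.unlink('/data/some-uuid-name')  # hypothetical path
+```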
+
+# Waiting for Postgres to start up with a [delay](test.sh)
+
+Normally I'd write something along the lines of
+[wait-for-cassandra](https://github.com/smok-serwis/wait-for-cassandra)
+or
+[wait-for-amqp](https://github.com/smok-serwis/wait-for-amqp) for
+services that take a really long while to start up, but since Postgres
+starts rather quickly and this is a throw-away solution, I didn't see the need to solve
+it any other way.
+
+I realize that on particularly slow CI servers the build may fail.
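+
+If the delay ever proves flaky, a more robust wait loop could look like this
+(a sketch assuming psycopg2 and a compose service named `postgres`):
+
+```python
+import time
+
+import psycopg2
+
+for _ in range(30):  # give Postgres up to ~30 seconds
+    try:
+        psycopg2.connect(host='postgres', user='postgres',
+                         password='postgres', connect_timeout=1).close()
+        break
+    except psycopg2.OperationalError:
+        time.sleep(1)  # not accepting connections yet, retry shortly
+else:
+    raise SystemExit('Postgres did not come up in time')
+```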
diff --git a/README.md b/README.md
index 416509f..7c5bc6e 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,8 @@ Netguru
 
 This contains a solution to Netguru's recruitment task.
 
-Please read the [notes](NOTES.md).
+Please read the [notes](NOTES.md). I've detailed the questionable design choices there,
+along with my motivation behind them.
 
 # Local development
 
@@ -70,6 +71,8 @@ provide it, a reasonable default value is supplied,
 but since it has already been used it isn't safe anymore.
 **CHANGE IT!**
 
+And don't even try any funny business - it's been changed on production :D
+
 You also need to provide a list of hosts from which
 this Django instance can be accessed. You do that
 by defining an environment variable called 
@@ -77,13 +80,13 @@ ALLOWED_HOSTS and giving a list of hosts separated
 by a comma. If you don't define it, a default 
 value of `*` will be used
 
+**WARNING!!** This is not secure, don't do it!
+
 You can optionally provide the environment variable
 `REDIS_CACHE_HOST` that should contain a hostname:port of
 a Redis cache instance, if you mean to configure Redis caching.
 It will be disabled by default.
 
-**WARNING!!** This is not secure, don't do it!
-
 ### Volumes
 
 The application needs a volume marked at 
@@ -97,7 +100,7 @@ files in a database is an antipattern.
 You will just have to forgive me that I failed
 to hook up any loggers for it. Any Python-based
 logger will do; I sincerely recommend
-[seq-log](https://github.com/tintoy/seqlog)
+[seqlog](https://github.com/tintoy/seqlog)
 - full disclosure: I'm a [contributor](https://github.com/tintoy/seqlog/blob/master/AUTHORS.rst) there
 
 #### Metrics
diff --git a/agent/middleware.py b/agent/middleware.py
index b5efb0e..49771f9 100644
--- a/agent/middleware.py
+++ b/agent/middleware.py
@@ -7,12 +7,13 @@ class UANotingMiddleware:
 
     def __call__(self, request):
         response = self.get_response(request)
-        if request.user.is_authenticated:
+        if request.user.is_authenticated and request.headers.get('User-Agent'):
+            user_agent = request.headers['User-Agent']
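+            # keep a single UA record per user: update it if present, create otherwise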
             try:
                 ua = UserAgentStorage.objects.get(user=request.user)
-                ua.ua = request.headers.get('User-Agent')
+                ua.ua = user_agent
             except UserAgentStorage.DoesNotExist:
-                ua = UserAgentStorage(user=request.user, ua=request.headers.get('User-Agent'))
+                ua = UserAgentStorage(user=request.user, ua=user_agent)
             ua.save()
 
         return response
diff --git a/counting/models.py b/counting/models.py
index 3671a88..74fe67d 100644
--- a/counting/models.py
+++ b/counting/models.py
@@ -3,13 +3,12 @@ from django.db import models
 
 class StoryOfADay(models.Model):
     day = models.DateField(verbose_name='Date', primary_key=True)
-    links = models.IntegerField(verbose_name='Links visited')
-    files = models.IntegerField(verbose_name='Files visited')
+    links = models.IntegerField(verbose_name='Amount of links visited')
+    files = models.IntegerField(verbose_name='Amount of files visited')
 
+    # hashable and eq-able so that instances can belong to a set
     def __hash__(self):
         return hash(self.day)
 
     def __eq__(self, other):
-        if isinstance(other, StoryOfADay):
-            other = other.day
-        return self.day == other
+        return isinstance(other, StoryOfADay) and self.day == other.day
diff --git a/netguru/settings.py b/netguru/settings.py
index 746d044..801da6b 100644
--- a/netguru/settings.py
+++ b/netguru/settings.py
@@ -7,9 +7,6 @@ from satella.instrumentation.metrics.exporters import PrometheusHTTPExporterThre
 
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 
-# Quick-start development settings - unsuitable for production
-# See https://docs.djangoproject.com/en/3.0/howto/deployment/checklist/
-
 # SECURITY WARNING: keep the secret key used in production secret!
 SECRET_KEY = os.environ.get('SECRET_KEY', 'r6(!w-%glre916esjft0^lds4u)=fym=v*26$@*&y7qcssfi%j')
 
@@ -35,26 +32,8 @@ INSTALLED_APPS = [
 
 CRON_CLASSES = [
     "counting.cron.ReaperJob",
-    # ...
 ]
 
-REST_FRAMEWORK = {
-    'TEST_REQUEST_DEFAULT_FORMAT': 'json',
-    'DEFAULT_SCHEMA_CLASS': 'rest_framework.schemas.coreapi.AutoSchema',
-    'DEFAULT_PARSER_CLASSES': (
-        'rest_framework.parsers.JSONParser',
-    ),
-    'DEFAULT_AUTHENTICATION_CLASSES': [
-        'rest_framework.authentication.SessionAuthentication',
-        'rest_framework.authentication.BasicAuthentication'
-    ],
-    'DEFAULT_RENDERER_CLASSES': (
-        'rest_framework.renderers.JSONRenderer',
-        'rest_framework.renderers.BrowsableAPIRenderer',
-    ),
-    'UNICODE_JSON': True,
-}
-
 MIDDLEWARE = [
     'django_satella_metrics.DjangoSatellaMetricsMiddleware',
     'django_opentracing.OpenTracingMiddleware',
@@ -144,6 +123,7 @@ STATIC_ROOT = os.path.join(BASE_DIR, 'media')
 FILE_SAVE_LOCATION = '/data'
 
 # Configure cache
+# ===============
 if os.environ.get('REDIS_CACHE_HOST'):
     CACHES = {
         'default': {
@@ -152,8 +132,26 @@ if os.environ.get('REDIS_CACHE_HOST'):
         },
     }
 
-# Configure DRF and Swagger
-DEFAULT_SCHEMA_CLASS = 'rest_framework.schemas.openapi.AutoSchema'
+# Configure DRF
+# =============
+
+REST_FRAMEWORK = {
+    'TEST_REQUEST_DEFAULT_FORMAT': 'json',
+    'DEFAULT_SCHEMA_CLASS': 'rest_framework.schemas.coreapi.AutoSchema',
+    'DEFAULT_PARSER_CLASSES': (
+        'rest_framework.parsers.JSONParser',
+    ),
+    'DEFAULT_AUTHENTICATION_CLASSES': [
+        'rest_framework.authentication.SessionAuthentication',
+        'rest_framework.authentication.BasicAuthentication'
+    ],
+    'DEFAULT_RENDERER_CLASSES': (
+        'rest_framework.renderers.JSONRenderer',
+        'rest_framework.renderers.BrowsableAPIRenderer',
+    ),
+    'UNICODE_JSON': True,
+}
+
 
 # Configure tracing
 # =================
@@ -183,8 +181,6 @@ phet.start()
 # =================
 
 logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.DEBUG if DEBUG else logging.INFO)
 if DEBUG:
-    logging.basicConfig(level=logging.DEBUG)
     logger.warning('WARNING! You are running in debug mode. Don\'t do it on production!')
-else:
-    logging.basicConfig(level=logging.INFO)
diff --git a/shares/models.py b/shares/models.py
index 694c224..4b4d60e 100644
--- a/shares/models.py
+++ b/shares/models.py
@@ -125,10 +125,6 @@ class Share(models.Model):
     def delete(self, *args, **kwargs):
         if self.share_type == SHARE_FILE:
             path = os.path.join(FILE_SAVE_LOCATION, self.uuid_name)
-            # Since we might have already deleted it, but our transaction was rolled
-            # back because of reasons.
-            # This is both EAFP (more Pythonic) and shorter 2 lines than a try ... catch
-            # besides it's a great showcase of my Satella
             with silence_excs(FileNotFoundError):
                 os.unlink(path)
             files_deleted.runtime(+1)
@@ -143,12 +139,6 @@ def hash_password(password: str) -> str:
 
 def generate_correct_uuid() -> str:
     f = uuid.uuid4().hex
-    # I know that UUID collisions practically don't happen, but since it's so cheap for us
-    # to go that extra mile
-    # and since we're at risk at overwriting somebody's files, I guess i should check it
-    # I realize that it's susceptible to race conditions, but then again what is the chance?
-    # The chance that UUID will repeat during the 24 hours a link is active is much higher
-    # than the chance that it will repeat during the same 2 second window
     while os.path.exists(os.path.join(FILE_SAVE_LOCATION, f)):
         f = uuid.uuid4().hex
     return f
@@ -160,6 +150,7 @@ def add_file(file: UploadedFile, request, escape_url: bool = True) -> dict:
 
     :param file: file to add
     :param request: request to use
+    :param escape_url: whether to mark the URL for Django template output as safe
     :return: a dictionary with following keys (password, url, type)
     """
     data_added = {'password': User.objects.make_random_password(), 'type': 'file'}
@@ -186,8 +177,9 @@ def add_url(url: str, request, escape_url: bool = True) -> dict:
     """
     Adds a new URL
 
-    :param file: file to add
+    :param url: URL to add
     :param request: request to use
+    :param escape_url: whether to mark the URL for Django template output as safe
     :return: a dictionary with following keys (password, url, type)
     """
     data_added = {'password': User.objects.make_random_password(), 'type': 'file'}
-- 
GitLab