It may be useful if we need to look into the logs (let's say there's a bug in prod env and we need to fix, we only have logs to see what failed), we can filter them and look for the errors.
from gopro2gpx.main import extract
def extract_gpx_from_gopro(media_path: str, *, format: str = "GPX", binary: bool = False) -> str:
    """Extract GPS coordinates from a GoPro media file into a sidecar file.

    Parameters
    ----------
    media_path: path to Go-Pro media
    format: output format for spatial coordinates (e.g. "GPX")
    binary: whether media is binary

    Returns
    -------
    gpx_path: path of coordinates created file
    """
    # todo: should we infer whether or not input file is binary?
    output_path = f'{media_path}.{format.lower()}'
    # keep skip to false to be able to catch errors
    extract(
        input_file=media_path,
        output_file=output_path,
        format=format,
        binary=binary,
        verbose=0,
        skip=False,
    )
    return output_path
from moviepy.editor import VideoFileClip
def get_media_duration(media_path: str) -> float:
    """Extract duration (in seconds) from a media file.

    Parameters
    ----------
    media_path: Path to media

    Returns
    -------
    duration: duration of the media

    Notes
    -----
    Taken from here:
    https://www.reddit.com/r/moviepy/comments/2bsnrq/is_it_possible_to_get_the_length_of_a_video/
    """
    clip = VideoFileClip(media_path)
    try:
        return clip.duration
    finally:
        # Release the underlying ffmpeg reader; without this, file handles
        # and subprocesses leak when many media files are processed in a row.
        clip.close()
If I understand right, your final goal is to convert gps format to 2154 geometry, given a list of coordinates. So, I suggest to simplify the 4 functions longLat2shapePoint, longLat2shapeList ,
geometryTransfo and gps2154 into the following:
def add_geom_to_gps_point(gps_point: dict, source_epsg: int = 4326, target_epsg: int = 2154):
    """Add projection to a given geometry to a GPS point.

    Parameters
    ----------
    gps_point: dict with gps point containing Longitude and Latitude;
        mutated in place — the projected shapely Point is stored under "the_geom"
    source_epsg: EPSG code of the source coordinate system (default: WGS84)
    target_epsg: EPSG code of the destination coordinate system (default: Lambert-93)
    """
    # pyproj.transform and Proj(init="epsg:...") are deprecated since pyproj 2;
    # Transformer is the supported API. always_xy=True keeps the legacy
    # (longitude, latitude) axis order that the old init= syntax used.
    transformer = pyproj.Transformer.from_crs(
        f"EPSG:{source_epsg}",  # source coordinate system
        f"EPSG:{target_epsg}",  # destination coordinate system
        always_xy=True,
    )
    point = Point(gps_point["Longitude"], gps_point["Latitude"])
    gps_point["the_geom"] = transform(transformer.transform, point)
def add_geom_to_gps_points(gps_points: list, **kwargs):
    """Add projection to a given geometry to a GPS list of points.

    Parameters
    ----------
    gps_points: list of dict with gps point containing Longitude and Latitude;
        each dict is mutated in place
    kwargs: forwarded to add_geom_to_gps_point (source_epsg / target_epsg)
    """
    for point in gps_points:
        add_geom_to_gps_point(point, **kwargs)
## On Blob helpers (blob_ops.py)
- https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/blob_ops.py#L24-L40
I'm not sure this function really 'provides' any info about the blob since it only print some stuff. Do we really need it ? /!\ `blob_video_prop_keys` is defined but not used. If the only purpose is to keep those info printed in the log, I suggest we do this in function 'downloadBlob' with level 'debug'. What do you think?
- https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/blob_ops.py#L5-L21 Here, you catch all exceptions but we could distinguish two major error : `conn_string` is erroneous or container with name `container_name` does not exist; So we could catch those two in particular, so that the message is more informative. I suggest :
def safe_blob(func):
    """Decorator turning the two expected Azure blob failures into logged
    warnings followed by an early exit of the ETL process.

    Catches:
    - ValueError: blank or malformed connection string
    - HttpResponseError: e.g. the requested container does not exist
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):  # fix: was `*kwargs`, a syntax error
        try:
            # debug-level log instead of print; also fix: `func(args, **kwargs)`
            # passed the whole args tuple as a single positional argument
            logger.debug('calling %s with args=%s kwargs=%s', func, args, kwargs)
            return func(*args, **kwargs)
        except ValueError:
            # note the trailing spaces: these strings are implicitly
            # concatenated, so without them words run together in the log
            logger.warning('Could not connect to blob storage. '
                           'Connection string is either blank or malformed. '
                           'Early exit of ETL process !')
            exit()
        except HttpResponseError as e:
            logger.warning('Could not List blob, the container probably does not exist. '
                           f'Details : {e} '
                           'Early exit of ETL process !')
            exit()
    return wrapper
@safe_blob
def list_container_blob_names(conn_str: str, container_name: str) -> list:
    """List blob names in a given container.

    Parameters
    ----------
    conn_str: A connection string to an Azure Storage account.
    container_name: The container name for the blob.

    Returns
    -------
    blob_names: list of blob names in the container
    """
    container = ContainerClient.from_connection_string(
        conn_str=conn_str, container_name=container_name
    )
    return [blob.name for blob in container.list_blobs()]
- Why do we need to hardcode the download directory to `/tmp` in https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/blob_ops.py#L43-L51 ?
I suggest :
@safe_blob
def download_blob(blob_client: BlobClient, local_path: str = '/tmp') -> str:
    """Download Blob from Azure to local file system.

    Parameters
    ----------
    blob_client: Azure client to interact with a specific blob
    local_path: Local path to download the blob in (created if missing)

    Returns
    -------
    blob_directory: path of the file where blob content is downloaded
    """
    blob_name = blob_client.blob_name
    blob_directory = os.path.join(local_path, blob_name)
    # os.mkdir fails when the directory already exists or when intermediate
    # directories are missing; blob names may also contain '/' separators,
    # so create the full directory tree of the target path instead.
    os.makedirs(os.path.dirname(blob_directory), exist_ok=True)
    with open(blob_directory, "wb") as blob_folder:
        blob_data = blob_client.download_blob()
        blob_data.readinto(blob_folder)
    logger.debug(f'Blob {blob_name} has been successfully downloaded. \n '
                 f'Path to local storage : {blob_directory}')
    return blob_directory
## On AI helpers (ai_ops.py)
- In `AIready` you define thee AI url as `'http://aiapiplastico-dev.westeurope.cloudapp.azure.com:5000'` (with port) but then in `get_prediction`, you define it without the port which is hardcoded inside https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/ai_ops.py#L35
- https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/ai_ops.py#L6-L25 use `request.ok` and `request.reason` instead
I'd suggest something like :
def ai_ready(url: str) -> bool:
    """Evaluate whether AI inference service is available.

    Parameters
    ----------
    url: address of to AI API

    Returns
    -------
    ready: AI Status: True if AI is ready, else False
    """
    try:
        ai_request = requests.get(url)
        # '%s' placeholder is required: logging treats extra positional args as
        # lazy %-format values, so the previous ("msg: ", reason) form silently
        # dropped the reason from the emitted record.
        logger.info("AI request reason: %s", ai_request.reason)
        return ai_request.ok  # True if AI is ready, False if not.
    except requests.exceptions.RequestException:
        # Probably wrong url or AI server is down.
        logger.warning("AI not found, an error has occured")
        return False
- When requesting the AI you store the response (that's a json) in a list in `getPrediction` https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/ai_ops.py#L38 and then you go back to the first and only element of that list in `jsonPrediction` https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/ai_ops.py#L48. I suggest `getPrediction` returns the json as is 🙂.
- Why `# removing 2 x first and 3 last characters of pred` ? 🤔 OHH I understand what you're doing. The AI sends a json formated string, so there may be some characters around like `b'{"detected_trash":[],"fps":4,"video_id":"Users_raph_Desktop_Screenshot_2020-04-29_at_15.27.16.png","video_length":1}\n'` and you're trying to remove the `b` and the `\n`... But there's a function to deserialise a JSON and you don't need to do these tricks: `json.dumps(ai_response)` does the work. I suggest we get rid of this function and deserialise the ai response in `getPrediction`. Agreed ?
- Also, the only thing you use from the AI seems to be 'detected_trash', so function `getPrediction` could do the all work and return 'detected_trash' for conveniency purpose.
- https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/ai_ops.py#L53-L59 Do we really need a function for that ? I suggest one simple line : `prediction.get('label')` that is easier to read. Furthermore, if prediction (for some reason ? ) has no key `label` the `get` will return `None` whereas the `prediction['label']` will break.
- https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/ai_ops.py#L81-L84 If deprecated, I suggest we remove it.
- Same for this one: https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/ai_ops.py#L81-L84 I cannot find any usage.
## On Postgre helpers (pg_ops)
- You're using function `pgConnectionString` only once in `pgOpenConnection`. I suggest we simplify the script by calling the first directly in the second.
- `pgCloseConnection` only does `connection.close()` and if it fails it will raise a message. Do we really need a function for that ? I think it's adding some complexity..
## PEP 8 style recommendations
**PEP8 style recommendations [here](https://www.python.org/dev/peps/pep-0008)**🕵️♀️
If you're using a IDE, you probably have warnings on the right side and suggestions such as :
<img width="983" alt="Screenshot 2020-04-29 at 15 27 16" src="https://user-images.githubusercontent.com/22340670/80608404-0205f280-8a37-11ea-810e-e74b0bd8fa6a.png">
A very cool package to clean-up the code I discovered recently is called [black](https://github.com/psf/black).
- [PEP8] You have a lot of dead imports, they should be grouped in the following order:
1. Standard library imports.
2. Related third party imports.
3. Local application/library specific imports.
You should put a blank line between each group of imports.
For example instead of https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/etlworkflow_with_params.py#L1-L30
You'll have:
import os
import warnings
import gpxpy
import gpxpy.gpx
from azure.storage.blob import BlobClient
from tqdm import tqdm
from .ai_ops import AIready, getPrediction, jsonPrediction, getTrashLabel
from .blob_ops import blobInContainer, blobInfos, downloadBlob
from .gps_ops import goproToGPX, gpsPointList, getDuration, fillGPS, longLat2shapeList, gps2154
from .postgre_ops import pgConnectionString, pgOpenConnection, pgCloseConnection, mapLabel2TrashIdPG, trashGPS, \
trashInsert
- [PEP8] https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/etlworkflow_with_params.py#L56 to be replaced by `if isAIready:`
- [PEP8] Triple double quote should be used for docstring
- [PEP8] Function name should be lowercase, see [PEP8](https://www.python.org/dev/peps/pep-0008/#method-names-and-instance-variables) recommendations.
- [PEP8] docstring should use a standard format (eg. numpy, google, or ..)
For example https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/ai_ops.py#L6-L11 would become :
def ai_ready(url: str)-> bool:
""" Evaluate whether AI inference service is available
Parameters
----------
url: address of to AI API
Returns
-------
ready: AI Status: True if AI is ready, else False
"""
thank you so much for taking the time to make this review, this will help improving the overall ETL.
I'll try to share my feedback as soon as I can on your different proposal.
General comments
- `libmediainfo` should be installed. I'll do a PR for that :)
- Why is the download directory hardcoded to `tmp`? See examples : https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/gps_ops.py#L30 https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/etlworkflow_with_params.py#L75 I'd suggest either to set the data directory as an environment variable or to specify the path in the function. Default could be `/tmp` if you wish. I understand that the reason is probably the fact that in the future (in real production env), the video will indeed be downloaded in that tmp storage, but for someone who launches the script locally, it would be nice to be able to link to a local folder as well.
- Instead of `print`, you can use logs, so that you can specify the level of your message: debug, info, warning, error. For example, instead of : https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/etlworkflow_with_params.py#L119-L120 It may be useful if we need to look into the logs (let's say there's a bug in prod env and we need to fix, we only have logs to see what failed), we can filter them and look for the errors.
On the script itself (etlworkflow_with_params.py)
## On the script itself (etlworkflow_with_params.py)
- Rename `connection_string` to `blob_connection_string` so that we don't mix up with the db connection string :)
- `campaign0` is never used in the script. Either it's an error or it's old code.
- `ai_url = os.getenv("AI_URL")`
- Rename `video_xx` to `media_xx`, so that it works for images as well?
- Why `file_path` https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/etlworkflow_with_params.py#L44 ? I suggest we call it `media_name`.
- https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/etlworkflow_with_params.py#L111 https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/etlworkflow_with_params.py#L119 https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/etlworkflow_with_params.py#L123 You define the variable `trashID` but then re-get it with `prediction['id']`.
- `timestampDelta` is only used for a print? If the purpose is to check whether the video duration and the GPX duration are close, we should rather test it and raise a warning if it's not the case.
- Use `os.path.join` for windows people !
- Use `math.ceil` instead !

## On GPS helpers (gps_ops.py)
- [major] `goproToGPX` is defined twice https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/gps_ops.py#L15-L37 Is it here that we're supposed to use @rcourivaud 's package gopro2gpx ? I suggest we use his package and so, the function becomes the one proposed above.
- Rename `file_path` to `blob_video_name` ? https://github.com/surfriderfoundationeurope/etl/blob/01209895e4b473630a41901bdd9fe40ae24be46a/scripts/etlworkflow_with_params.py#L44 I suggest we go for `media_name` from the beginning if you agree :)
- In `getMediaInfo` you call a shell script, https://github.com/surfriderfoundationeurope/etl/blob/5475dd8de8523c4ae1e7296feb75b40ff0c25fc5/scripts/gps_ops.py#L62-L63 we could instead use the python wrapper package pymediainfo or also the package `moviepy`. But as far as I see, you're only using these media calls to access the duration. What else do you need? As reported here, I suggest `fillGPS` https://github.com/surfriderfoundationeurope/etl/blob/5475dd8de8523c4ae1e7296feb75b40ff0c25fc5/scripts/gps_ops.py#L121-L128
- If I understand right, your final goal is to convert gps format to 2154 geometry, given a list of coordinates. So, I suggest to simplify the 4 functions `longLat2shapePoint`, `longLat2shapeList`, `geometryTransfo` and `gps2154`
into the following:def add_geom_to_gps_points(gps_points: list, **kwargs): """ Add projection to a given geometry to a GPS list of points
def safe_blob(func): @functools.wraps(func) def wrapper(*args, *kwargs): try: print('calling', func, 'with', args, kwargs) return func(args, **kwargs) except ValueError: logger.warning('Could not connect to blob storage. ' 'Connection string is either blank or malformed.' 'Early exit of ETL process !') exit()
@safe_blob def list_container_blob_names(conn_str: str, container_name: str) -> list: """ List blob names in a given container
@safe_blob def download_blob(blob_client: BlobClient, local_path: str = '/tmp') -> str: """ Download Blob from Azure to local file system
def ai_ready(url: str) -> bool: """ Evaluate whether AI inference service is available
import os import warnings
import gpxpy import gpxpy.gpx from azure.storage.blob import BlobClient from tqdm import tqdm
from .ai_ops import AIready, getPrediction, jsonPrediction, getTrashLabel from .blob_ops import blobInContainer, blobInfos, downloadBlob from .gps_ops import goproToGPX, gpsPointList, getDuration, fillGPS, longLat2shapeList, gps2154 from .postgre_ops import pgConnectionString, pgOpenConnection, pgCloseConnection, mapLabel2TrashIdPG, trashGPS, \ trashInsert
def ai_ready(url: str)-> bool: """ Evaluate whether AI inference service is available