heartwood every commit a ring

Add crawler insights

3aca9d23 by Isaac Bythewood · 3 years ago

modified .gitignore
@@ -7,3 +7,4 @@ db.sqlite3-journalmedianode_modulesstaticcrawler_output
modified Makefile
@@ -14,7 +14,7 @@ PROJECT_NAME = $(shell basename $(PWD))run: check install	@echo "run ----------------------------------------------------------------"	${MAKE} -j2 runserver webpack scheduler	${MAKE} -j3 runserver webpack schedulerrunserver:	pipenv run python manage.py runserver
modified Pipfile
@@ -10,6 +10,7 @@ requests = "*"tzdata = "*"  # Fixes "zoneinfo._common.ZoneInfoNotFoundError" on docker serveruvicorn = "*"whitenoise = "*"scrapy = "*"[dev-packages]black = "*"
modified Pipfile.lock
@@ -1,7 +1,7 @@{    "_meta": {        "hash": {            "sha256": "6644f4a3e9cf3514f8f1dd9ac434ca925089c5f483ee712fcd77c9dafed9766d"            "sha256": "a7cb1fab6df914ce6e0575d9b8ef99d4dcb8e591e6a44d31c3651d26706990ad"        },        "pipfile-spec": 6,        "requires": {
@@ -24,6 +24,21 @@            "markers": "python_version >= '3.7'",            "version": "==3.5.2"        },        "attrs": {            "hashes": [                "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",                "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"            ],            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",            "version": "==21.4.0"        },        "automat": {            "hashes": [                "sha256:7979803c74610e11ef0c0d68a2942b152df52da55336e0c9d58daf1831cbdf33",                "sha256:b6feb6455337df834f6c9962d6ccf771515b7d939bca142b29c20c2376bc6111"            ],            "version": "==20.2.0"        },        "certifi": {            "hashes": [                "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d",
@@ -32,6 +47,75 @@            "markers": "python_version >= '3.6'",            "version": "==2022.6.15"        },        "cffi": {            "hashes": [                "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5",                "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef",                "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104",                "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426",                "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405",                "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375",                "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a",                "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e",                "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc",                "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf",                "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185",                "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497",                "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3",                "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35",                "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c",                "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83",                "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21",                "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca",                "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984",                "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac",                
"sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd",                "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee",                "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a",                "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2",                "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192",                "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7",                "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585",                "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f",                "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e",                "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27",                "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b",                "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e",                "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e",                "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d",                "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c",                "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415",                "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82",                "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02",                "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314",                "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325",                "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c",                "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3",                
"sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914",                "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045",                "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d",                "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9",                "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5",                "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2",                "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c",                "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3",                "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2",                "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8",                "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d",                "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d",                "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9",                "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162",                "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76",                "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4",                "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e",                "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9",                "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6",                "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b",                "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01",                "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"            ],            "version": 
"==1.15.1"        },        "charset-normalizer": {            "hashes": [                "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5",
@@ -48,6 +132,49 @@            "markers": "python_version >= '3.7'",            "version": "==8.1.3"        },        "constantly": {            "hashes": [                "sha256:586372eb92059873e29eba4f9dec8381541b4d3834660707faf8ba59146dfc35",                "sha256:dd2fa9d6b1a51a83f0d7dd76293d734046aa176e384bf6e33b7e44880eb37c5d"            ],            "version": "==15.1.0"        },        "cryptography": {            "hashes": [                "sha256:190f82f3e87033821828f60787cfa42bff98404483577b591429ed99bed39d59",                "sha256:2be53f9f5505673eeda5f2736bea736c40f051a739bfae2f92d18aed1eb54596",                "sha256:30788e070800fec9bbcf9faa71ea6d8068f5136f60029759fd8c3efec3c9dcb3",                "sha256:3d41b965b3380f10e4611dbae366f6dc3cefc7c9ac4e8842a806b9672ae9add5",                "sha256:4c590ec31550a724ef893c50f9a97a0c14e9c851c85621c5650d699a7b88f7ab",                "sha256:549153378611c0cca1042f20fd9c5030d37a72f634c9326e225c9f666d472884",                "sha256:63f9c17c0e2474ccbebc9302ce2f07b55b3b3fcb211ded18a42d5764f5c10a82",                "sha256:6bc95ed67b6741b2607298f9ea4932ff157e570ef456ef7ff0ef4884a134cc4b",                "sha256:7099a8d55cd49b737ffc99c17de504f2257e3787e02abe6d1a6d136574873441",                "sha256:75976c217f10d48a8b5a8de3d70c454c249e4b91851f6838a4e48b8f41eb71aa",                "sha256:7bc997818309f56c0038a33b8da5c0bfbb3f1f067f315f9abd6fc07ad359398d",                "sha256:80f49023dd13ba35f7c34072fa17f604d2f19bf0989f292cedf7ab5770b87a0b",                "sha256:91ce48d35f4e3d3f1d83e29ef4a9267246e6a3be51864a5b7d2247d5086fa99a",                "sha256:a958c52505c8adf0d3822703078580d2c0456dd1d27fabfb6f76fe63d2971cd6",                "sha256:b62439d7cd1222f3da897e9a9fe53bbf5c104fff4d60893ad1355d4c14a24157",                "sha256:b7f8dd0d4c1f21759695c05a5ec8536c12f31611541f8904083f3dc582604280",                "sha256:d204833f3c8a33bbe11eda63a54b1aad7aa7456ed769a982f21ec599ba5fa282",                
"sha256:e007f052ed10cc316df59bc90fbb7ff7950d7e2919c9757fd42a2b8ecf8a5f67",                "sha256:f2dcb0b3b63afb6df7fd94ec6fbddac81b5492513f7b0436210d390c14d46ee8",                "sha256:f721d1885ecae9078c3f6bbe8a88bc0786b6e749bf32ccec1ef2b18929a05046",                "sha256:f7a6de3e98771e183645181b3627e2563dcde3ce94a9e42a3f427d2255190327",                "sha256:f8c0a6e9e1dd3eb0414ba320f85da6b0dcbd543126e30fcc546e7372a7fbf3b9"            ],            "markers": "python_version >= '3.6'",            "version": "==37.0.4"        },        "cssselect": {            "hashes": [                "sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf",                "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc"            ],            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",            "version": "==1.1.0"        },        "django": {            "hashes": [                "sha256:a67a793ff6827fd373555537dca0da293a63a316fe34cb7f367f898ccca3c3ae",
@@ -56,6 +183,14 @@            "index": "pypi",            "version": "==4.0.6"        },        "filelock": {            "hashes": [                "sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404",                "sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04"            ],            "markers": "python_version >= '3.7'",            "version": "==3.7.1"        },        "gunicorn": {            "hashes": [                "sha256:9dcc4547dbb1cb284accfb15ab5667a0e5d1881cc443e0677b4882a4067a807e",
@@ -72,6 +207,13 @@            "markers": "python_version >= '3.6'",            "version": "==0.13.0"        },        "hyperlink": {            "hashes": [                "sha256:427af957daa58bc909471c6c40f74c5450fa123dd093fc53efd2e91d2705a56b",                "sha256:e6b14c37ecb73e89c77d78cdb4c2cc8f3fb59a885c5b3f819ff4ed80f25af1b4"            ],            "version": "==21.0.0"        },        "idna": {            "hashes": [                "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
@@ -80,6 +222,195 @@            "markers": "python_version >= '3.5'",            "version": "==3.3"        },        "incremental": {            "hashes": [                "sha256:02f5de5aff48f6b9f665d99d48bfc7ec03b6e3943210de7cfc88856d755d6f57",                "sha256:92014aebc6a20b78a8084cdd5645eeaa7f74b8933f70fa3ada2cfbd1e3b54321"            ],            "version": "==21.3.0"        },        "itemadapter": {            "hashes": [                "sha256:3f1f60ebd6c91b00222820f38bf5126aa49a8fb6e467d351f0364421f953a15d",                "sha256:6641a97a52a3e60eb7b0e626010af6edcb6af834b32ce129992150dec266c916"            ],            "markers": "python_version >= '3.6'",            "version": "==0.6.0"        },        "itemloaders": {            "hashes": [                "sha256:1277cd8ca3e4c02dcdfbc1bcae9134ad89acfa6041bd15b4561c6290203a0c96",                "sha256:4cb46a0f8915e910c770242ae3b60b1149913ed37162804f1e40e8535d6ec497"            ],            "markers": "python_version >= '3.6'",            "version": "==1.0.4"        },        "jmespath": {            "hashes": [                "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980",                "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"            ],            "markers": "python_version >= '3.7'",            "version": "==1.0.1"        },        "lxml": {            "hashes": [                "sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318",                "sha256:0538747a9d7827ce3e16a8fdd201a99e661c7dee3c96c885d8ecba3c35d1032c",                "sha256:0645e934e940107e2fdbe7c5b6fb8ec6232444260752598bc4d09511bd056c0b",                "sha256:079b68f197c796e42aa80b1f739f058dcee796dc725cc9a1be0cdb08fc45b000",                "sha256:0f3f0059891d3254c7b5fb935330d6db38d6519ecd238ca4fce93c234b4a0f73",                "sha256:10d2017f9150248563bb579cd0d07c61c58da85c922b780060dcc9a3aa9f432d",                
"sha256:1355755b62c28950f9ce123c7a41460ed9743c699905cbe664a5bcc5c9c7c7fb",                "sha256:13c90064b224e10c14dcdf8086688d3f0e612db53766e7478d7754703295c7c8",                "sha256:1423631e3d51008871299525b541413c9b6c6423593e89f9c4cfbe8460afc0a2",                "sha256:1436cf0063bba7888e43f1ba8d58824f085410ea2025befe81150aceb123e345",                "sha256:1a7c59c6ffd6ef5db362b798f350e24ab2cfa5700d53ac6681918f314a4d3b94",                "sha256:1e1cf47774373777936c5aabad489fef7b1c087dcd1f426b621fda9dcc12994e",                "sha256:206a51077773c6c5d2ce1991327cda719063a47adc02bd703c56a662cdb6c58b",                "sha256:21fb3d24ab430fc538a96e9fbb9b150029914805d551deeac7d7822f64631dfc",                "sha256:27e590352c76156f50f538dbcebd1925317a0f70540f7dc8c97d2931c595783a",                "sha256:287605bede6bd36e930577c5925fcea17cb30453d96a7b4c63c14a257118dbb9",                "sha256:2aaf6a0a6465d39b5ca69688fce82d20088c1838534982996ec46633dc7ad6cc",                "sha256:32a73c53783becdb7eaf75a2a1525ea8e49379fb7248c3eeefb9412123536387",                "sha256:41fb58868b816c202e8881fd0f179a4644ce6e7cbbb248ef0283a34b73ec73bb",                "sha256:4780677767dd52b99f0af1f123bc2c22873d30b474aa0e2fc3fe5e02217687c7",                "sha256:4878e667ebabe9b65e785ac8da4d48886fe81193a84bbe49f12acff8f7a383a4",                "sha256:487c8e61d7acc50b8be82bda8c8d21d20e133c3cbf41bd8ad7eb1aaeb3f07c97",                "sha256:4beea0f31491bc086991b97517b9683e5cfb369205dac0148ef685ac12a20a67",                "sha256:4cfbe42c686f33944e12f45a27d25a492cc0e43e1dc1da5d6a87cbcaf2e95627",                "sha256:4d5bae0a37af799207140652a700f21a85946f107a199bcb06720b13a4f1f0b7",                "sha256:4e285b5f2bf321fc0857b491b5028c5f276ec0c873b985d58d7748ece1d770dd",                "sha256:57e4d637258703d14171b54203fd6822fda218c6c2658a7d30816b10995f29f3",                "sha256:5974895115737a74a00b321e339b9c3f45c20275d226398ae79ac008d908bff7",                
"sha256:5ef87fca280fb15342726bd5f980f6faf8b84a5287fcc2d4962ea8af88b35130",                "sha256:603a464c2e67d8a546ddaa206d98e3246e5db05594b97db844c2f0a1af37cf5b",                "sha256:6653071f4f9bac46fbc30f3c7838b0e9063ee335908c5d61fb7a4a86c8fd2036",                "sha256:6ca2264f341dd81e41f3fffecec6e446aa2121e0b8d026fb5130e02de1402785",                "sha256:6d279033bf614953c3fc4a0aa9ac33a21e8044ca72d4fa8b9273fe75359d5cca",                "sha256:6d949f53ad4fc7cf02c44d6678e7ff05ec5f5552b235b9e136bd52e9bf730b91",                "sha256:6daa662aba22ef3258934105be2dd9afa5bb45748f4f702a3b39a5bf53a1f4dc",                "sha256:6eafc048ea3f1b3c136c71a86db393be36b5b3d9c87b1c25204e7d397cee9536",                "sha256:830c88747dce8a3e7525defa68afd742b4580df6aa2fdd6f0855481e3994d391",                "sha256:86e92728ef3fc842c50a5cb1d5ba2bc66db7da08a7af53fb3da79e202d1b2cd3",                "sha256:8caf4d16b31961e964c62194ea3e26a0e9561cdf72eecb1781458b67ec83423d",                "sha256:8d1a92d8e90b286d491e5626af53afef2ba04da33e82e30744795c71880eaa21",                "sha256:8f0a4d179c9a941eb80c3a63cdb495e539e064f8054230844dcf2fcb812b71d3",                "sha256:9232b09f5efee6a495a99ae6824881940d6447debe272ea400c02e3b68aad85d",                "sha256:927a9dd016d6033bc12e0bf5dee1dde140235fc8d0d51099353c76081c03dc29",                "sha256:93e414e3206779ef41e5ff2448067213febf260ba747fc65389a3ddaa3fb8715",                "sha256:98cafc618614d72b02185ac583c6f7796202062c41d2eeecdf07820bad3295ed",                "sha256:9c3a88d20e4fe4a2a4a84bf439a5ac9c9aba400b85244c63a1ab7088f85d9d25",                "sha256:9f36de4cd0c262dd9927886cc2305aa3f2210db437aa4fed3fb4940b8bf4592c",                "sha256:a60f90bba4c37962cbf210f0188ecca87daafdf60271f4c6948606e4dabf8785",                "sha256:a614e4afed58c14254e67862456d212c4dcceebab2eaa44d627c2ca04bf86837",                "sha256:ae06c1e4bc60ee076292e582a7512f304abdf6c70db59b56745cca1684f875a4",                
"sha256:b122a188cd292c4d2fcd78d04f863b789ef43aa129b233d7c9004de08693728b",                "sha256:b570da8cd0012f4af9fa76a5635cd31f707473e65a5a335b186069d5c7121ff2",                "sha256:bcaa1c495ce623966d9fc8a187da80082334236a2a1c7e141763ffaf7a405067",                "sha256:bd34f6d1810d9354dc7e35158aa6cc33456be7706df4420819af6ed966e85448",                "sha256:be9eb06489bc975c38706902cbc6888f39e946b81383abc2838d186f0e8b6a9d",                "sha256:c4b2e0559b68455c085fb0f6178e9752c4be3bba104d6e881eb5573b399d1eb2",                "sha256:c62e8dd9754b7debda0c5ba59d34509c4688f853588d75b53c3791983faa96fc",                "sha256:c852b1530083a620cb0de5f3cd6826f19862bafeaf77586f1aef326e49d95f0c",                "sha256:d9fc0bf3ff86c17348dfc5d322f627d78273eba545db865c3cd14b3f19e57fa5",                "sha256:dad7b164905d3e534883281c050180afcf1e230c3d4a54e8038aa5cfcf312b84",                "sha256:e5f66bdf0976ec667fc4594d2812a00b07ed14d1b44259d19a41ae3fff99f2b8",                "sha256:e8f0c9d65da595cfe91713bc1222af9ecabd37971762cb830dea2fc3b3bb2acf",                "sha256:edffbe3c510d8f4bf8640e02ca019e48a9b72357318383ca60e3330c23aaffc7",                "sha256:eea5d6443b093e1545ad0210e6cf27f920482bfcf5c77cdc8596aec73523bb7e",                "sha256:ef72013e20dd5ba86a8ae1aed7f56f31d3374189aa8b433e7b12ad182c0d2dfb",                "sha256:f05251bbc2145349b8d0b77c0d4e5f3b228418807b1ee27cefb11f69ed3d233b",                "sha256:f1be258c4d3dc609e654a1dc59d37b17d7fef05df912c01fc2e15eb43a9735f3",                "sha256:f9ced82717c7ec65a67667bb05865ffe38af0e835cdd78728f1209c8fffe0cad",                "sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8",                "sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f"            ],            "markers": "platform_python_implementation == 'CPython'",            "version": "==4.9.1"        },        "parsel": {            "hashes": [                
"sha256:70efef0b651a996cceebc69e55a85eb2233be0890959203ba7c3a03c72725c79",                "sha256:9e1fa8db1c0b4a878bf34b35c043d89c9d1cbebc23b4d34dbc3c0ec33f2e087d"            ],            "version": "==1.6.0"        },        "protego": {            "hashes": [                "sha256:04419b18f20e8909f1691c6b678392988271cc2a324a72f9663cb3af838b4bf7",                "sha256:df666d4304dab774e2dc9feb208bb1ac8d71ea5ceec12f4c99eba30fbd642ff2"            ],            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",            "version": "==0.2.1"        },        "pyasn1": {            "hashes": [                "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359",                "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576",                "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf",                "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7",                "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d",                "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00",                "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8",                "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86",                "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12",                "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776",                "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba",                "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2",                "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"            ],            "version": "==0.4.8"        },        "pyasn1-modules": {            "hashes": [                
"sha256:0845a5582f6a02bb3e1bde9ecfc4bfcae6ec3210dd270522fee602365430c3f8",                "sha256:0fe1b68d1e486a1ed5473f1302bd991c1611d319bba158e98b106ff86e1d7199",                "sha256:15b7c67fabc7fc240d87fb9aabf999cf82311a6d6fb2c70d00d3d0604878c811",                "sha256:426edb7a5e8879f1ec54a1864f16b882c2837bfd06eee62f2c982315ee2473ed",                "sha256:65cebbaffc913f4fe9e4808735c95ea22d7a7775646ab690518c056784bc21b4",                "sha256:905f84c712230b2c592c19470d3ca8d552de726050d1d1716282a1f6146be65e",                "sha256:a50b808ffeb97cb3601dd25981f6b016cbb3d31fbf57a8b8a87428e6158d0c74",                "sha256:a99324196732f53093a84c4369c996713eb8c89d360a496b599fb1a9c47fc3eb",                "sha256:b80486a6c77252ea3a3e9b1e360bc9cf28eaac41263d173c032581ad2f20fe45",                "sha256:c29a5e5cc7a3f05926aff34e097e84f8589cd790ce0ed41b67aed6857b26aafd",                "sha256:cbac4bc38d117f2a49aeedec4407d23e8866ea4ac27ff2cf7fb3e5b570df19e0",                "sha256:f39edd8c4ecaa4556e989147ebf219227e2cd2e8a43c7e7fcb1f1c18c5fd6a3d",                "sha256:fe0644d9ab041506b62782e92b06b8c68cca799e1a9636ec398675459e031405"            ],            "version": "==0.2.8"        },        "pycparser": {            "hashes": [                "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9",                "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"            ],            "version": "==2.21"        },        "pydispatcher": {            "hashes": [                "sha256:5570069e1b1769af1fe481de6dd1d3a388492acddd2cdad7a3bde145615d5caf",                "sha256:5be4a8be12805ef7d712dd9a93284fb8bc53f309867e573f653a72e5fd10e433"            ],            "markers": "platform_python_implementation == 'CPython'",            "version": "==2.0.5"        },        "pyopenssl": {            "hashes": [                "sha256:660b1b1425aac4a1bea1d94168a85d99f0b3144c869dd4390d27629d0087f1bf",                
"sha256:ea252b38c87425b64116f808355e8da644ef9b07e429398bfece610f893ee2e0"            ],            "markers": "python_version >= '3.6'",            "version": "==22.0.0"        },        "queuelib": {            "hashes": [                "sha256:4b207267f2642a8699a1f806045c56eb7ad1a85a10c0e249884580d139c2fcd2",                "sha256:4b96d48f650a814c6fb2fd11b968f9c46178b683aad96d68f930fe13a8574d19"            ],            "markers": "python_version >= '3.5'",            "version": "==1.6.2"        },        "requests": {            "hashes": [                "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
@@ -88,6 +419,28 @@            "index": "pypi",            "version": "==2.28.1"        },        "requests-file": {            "hashes": [                "sha256:07d74208d3389d01c38ab89ef403af0cfec63957d53a0081d8eca738d0247d8e",                "sha256:dfe5dae75c12481f68ba353183c53a65e6044c923e64c24b2209f6c7570ca953"            ],            "version": "==1.5.1"        },        "scrapy": {            "hashes": [                "sha256:56fd55a59d0f329ce752892358abee5a6b50b4fc55a40420ea317dc617553827",                "sha256:e977f57d4e828f25d2702e8c2212e8abcd7d6ce45be560f1830a39432a494c09"            ],            "index": "pypi",            "version": "==2.6.1"        },        "service-identity": {            "hashes": [                "sha256:6e6c6086ca271dc11b033d17c3a8bea9f24ebff920c587da090afc9519419d34",                "sha256:f0b0caac3d40627c3c04d7a51b6e06721857a0e10a8775f2d1d7e72901b3a7db"            ],            "version": "==21.1.0"        },        "setuptools": {            "hashes": [                "sha256:0d33c374d41c7863419fc8f6c10bfe25b7b498aa34164d135c622e52580c6b16",
@@ -96,6 +449,14 @@            "markers": "python_version >= '3.7'",            "version": "==63.2.0"        },        "six": {            "hashes": [                "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",                "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"            ],            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",            "version": "==1.16.0"        },        "sqlparse": {            "hashes": [                "sha256:0c00730c74263a94e5a9919ade150dfc3b19c574389985446148402998287dae",
@@ -104,6 +465,30 @@            "markers": "python_version >= '3.5'",            "version": "==0.4.2"        },        "tldextract": {            "hashes": [                "sha256:35a0260570e214d8d3cfeeb403992fe9e2b686925f63c9b03c5933408ac2aa5a",                "sha256:fe15ac3205e5a25b61689369f98cb45c7778a8f2af113d7c11559ece5195f2d6"            ],            "markers": "python_version >= '3.7'",            "version": "==3.3.1"        },        "twisted": {            "hashes": [                "sha256:a047990f57dfae1e0bd2b7df2526d4f16dcdc843774dc108b78c52f2a5f13680",                "sha256:f9f7a91f94932477a9fc3b169d57f54f96c6e74a23d78d9ce54039a7f48928a2"            ],            "markers": "python_full_version >= '3.6.7'",            "version": "==22.4.0"        },        "typing-extensions": {            "hashes": [                "sha256:25642c956049920a5aa49edcdd6ab1e06d7e5d467fc00e0506c44ac86fbfca02",                "sha256:e6d2677a32f47fc7eb2795db1dd15c1f34eff616bcaf2cfb5e997f854fa1c4a6"            ],            "markers": "python_version >= '3.7'",            "version": "==4.3.0"        },        "tzdata": {            "hashes": [                "sha256:238e70234214138ed7b4e8a0fab0e5e13872edab3be586ab8198c407620e2ab9",
@@ -128,6 +513,13 @@            "index": "pypi",            "version": "==0.18.2"        },        "w3lib": {            "hashes": [                "sha256:0161d55537063e00d95a241663ede3395c4c6d7b777972ba2fd58bbab2001e53",                "sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df"            ],            "version": "==1.22.0"        },        "whitenoise": {            "hashes": [                "sha256:8e9c600a5c18bd17655ef668ad55b5edf6c24ce9bdca5bf607649ca4b1e8e2c2",
@@ -135,6 +527,63 @@            ],            "index": "pypi",            "version": "==6.2.0"        },        "zope.interface": {            "hashes": [                "sha256:08f9636e99a9d5410181ba0729e0408d3d8748026ea938f3b970a0249daa8192",                "sha256:0b465ae0962d49c68aa9733ba92a001b2a0933c317780435f00be7ecb959c702",                "sha256:0cba8477e300d64a11a9789ed40ee8932b59f9ee05f85276dbb4b59acee5dd09",                "sha256:0cee5187b60ed26d56eb2960136288ce91bcf61e2a9405660d271d1f122a69a4",                "sha256:0ea1d73b7c9dcbc5080bb8aaffb776f1c68e807767069b9ccdd06f27a161914a",                "sha256:0f91b5b948686659a8e28b728ff5e74b1be6bf40cb04704453617e5f1e945ef3",                "sha256:15e7d1f7a6ee16572e21e3576d2012b2778cbacf75eb4b7400be37455f5ca8bf",                "sha256:17776ecd3a1fdd2b2cd5373e5ef8b307162f581c693575ec62e7c5399d80794c",                "sha256:194d0bcb1374ac3e1e023961610dc8f2c78a0f5f634d0c737691e215569e640d",                "sha256:1c0e316c9add0db48a5b703833881351444398b04111188069a26a61cfb4df78",                "sha256:205e40ccde0f37496904572035deea747390a8b7dc65146d30b96e2dd1359a83",                "sha256:273f158fabc5ea33cbc936da0ab3d4ba80ede5351babc4f577d768e057651531",                "sha256:2876246527c91e101184f63ccd1d716ec9c46519cc5f3d5375a3351c46467c46",                "sha256:2c98384b254b37ce50eddd55db8d381a5c53b4c10ee66e1e7fe749824f894021",                "sha256:2e5a26f16503be6c826abca904e45f1a44ff275fdb7e9d1b75c10671c26f8b94",                "sha256:334701327f37c47fa628fc8b8d28c7d7730ce7daaf4bda1efb741679c2b087fc",                "sha256:3748fac0d0f6a304e674955ab1365d515993b3a0a865e16a11ec9d86fb307f63",                "sha256:3c02411a3b62668200910090a0dff17c0b25aaa36145082a5a6adf08fa281e54",                "sha256:3dd4952748521205697bc2802e4afac5ed4b02909bb799ba1fe239f77fd4e117",                "sha256:3f24df7124c323fceb53ff6168da70dbfbae1442b4f3da439cd441681f54fe25",                
"sha256:469e2407e0fe9880ac690a3666f03eb4c3c444411a5a5fddfdabc5d184a79f05",                "sha256:4de4bc9b6d35c5af65b454d3e9bc98c50eb3960d5a3762c9438df57427134b8e",                "sha256:5208ebd5152e040640518a77827bdfcc73773a15a33d6644015b763b9c9febc1",                "sha256:52de7fc6c21b419078008f697fd4103dbc763288b1406b4562554bd47514c004",                "sha256:5bb3489b4558e49ad2c5118137cfeaf59434f9737fa9c5deefc72d22c23822e2",                "sha256:5dba5f530fec3f0988d83b78cc591b58c0b6eb8431a85edd1569a0539a8a5a0e",                "sha256:5dd9ca406499444f4c8299f803d4a14edf7890ecc595c8b1c7115c2342cadc5f",                "sha256:5f931a1c21dfa7a9c573ec1f50a31135ccce84e32507c54e1ea404894c5eb96f",                "sha256:63b82bb63de7c821428d513607e84c6d97d58afd1fe2eb645030bdc185440120",                "sha256:66c0061c91b3b9cf542131148ef7ecbecb2690d48d1612ec386de9d36766058f",                "sha256:6f0c02cbb9691b7c91d5009108f975f8ffeab5dff8f26d62e21c493060eff2a1",                "sha256:71aace0c42d53abe6fc7f726c5d3b60d90f3c5c055a447950ad6ea9cec2e37d9",                "sha256:7d97a4306898b05404a0dcdc32d9709b7d8832c0c542b861d9a826301719794e",                "sha256:7df1e1c05304f26faa49fa752a8c690126cf98b40b91d54e6e9cc3b7d6ffe8b7",                "sha256:8270252effc60b9642b423189a2fe90eb6b59e87cbee54549db3f5562ff8d1b8",                "sha256:867a5ad16892bf20e6c4ea2aab1971f45645ff3102ad29bd84c86027fa99997b",                "sha256:877473e675fdcc113c138813a5dd440da0769a2d81f4d86614e5d62b69497155",                "sha256:8892f89999ffd992208754851e5a052f6b5db70a1e3f7d54b17c5211e37a98c7",                "sha256:9a9845c4c6bb56e508651f005c4aeb0404e518c6f000d5a1123ab077ab769f5c",                "sha256:a1e6e96217a0f72e2b8629e271e1b280c6fa3fe6e59fa8f6701bec14e3354325",                "sha256:a8156e6a7f5e2a0ff0c5b21d6bcb45145efece1909efcbbbf48c56f8da68221d",                "sha256:a9506a7e80bcf6eacfff7f804c0ad5350c8c95b9010e4356a4b36f5322f09abb",                
"sha256:af310ec8335016b5e52cae60cda4a4f2a60a788cbb949a4fbea13d441aa5a09e",                "sha256:b0297b1e05fd128d26cc2460c810d42e205d16d76799526dfa8c8ccd50e74959",                "sha256:bf68f4b2b6683e52bec69273562df15af352e5ed25d1b6641e7efddc5951d1a7",                "sha256:d0c1bc2fa9a7285719e5678584f6b92572a5b639d0e471bb8d4b650a1a910920",                "sha256:d4d9d6c1a455d4babd320203b918ccc7fcbefe308615c521062bc2ba1aa4d26e",                "sha256:db1fa631737dab9fa0b37f3979d8d2631e348c3b4e8325d6873c2541d0ae5a48",                "sha256:dd93ea5c0c7f3e25335ab7d22a507b1dc43976e1345508f845efc573d3d779d8",                "sha256:f44e517131a98f7a76696a7b21b164bcb85291cee106a23beccce454e1f433a4",                "sha256:f7ee479e96f7ee350db1cf24afa5685a5899e2b34992fb99e1f7c1b0b758d263"            ],            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",            "version": "==5.4.0"        }    },    "develop": {
added crawler/__init__.py
added crawler/items.py
# Item models for the crawler (standard Scrapy project scaffold).
import scrapy


class CrawlerItem(scrapy.Item):
    # Intentionally empty: the spiders in this project yield plain dicts,
    # so no scrapy.Field() declarations are needed yet.
    pass
added crawler/middlewares.py
from scrapy import signals
from itemadapter import is_item, ItemAdapter


class CrawlerSpiderMiddleware:
    """Spider middleware scaffold.

    Every hook forwards its input unchanged; Scrapy treats any method
    left undefined as a no-op, so these exist only as extension points.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy when assembling the middleware chain.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Runs for each response entering the spider; None means "continue".
        return None

    def process_spider_output(self, response, result, spider):
        # Pass the spider's yielded requests/items through untouched.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Returning None lets other middlewares / the engine handle it.
        pass

    def process_start_requests(self, start_requests, spider):
        # Forward the start requests unchanged (must yield Requests only).
        yield from start_requests

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class CrawlerDownloaderMiddleware:
    """Downloader middleware scaffold.

    Like the spider middleware above, every hook is a pass-through;
    Scrapy skips any method that is not defined.
    """

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # None -> continue down the chain toward the downloader.
        return None

    def process_response(self, request, response, spider):
        # Hand the downloaded response back unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # None -> let other middlewares' process_exception() run.
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
added crawler/pipelines.py
from itemadapter import ItemAdapter


class CrawlerPipeline:
    """Item pipeline scaffold: forwards every item without modification."""

    def process_item(self, item, spider):
        # No cleaning or persistence is done yet; returning the item
        # keeps the pipeline chain flowing for later stages.
        return item
added crawler/runner.py
import os
import subprocess

from django.conf import settings


def run_seo_spider(url):
    """Run the SEO spider against ``url``, writing one JSON object per
    crawled page to a file named after the site's hostname.

    The spider is launched through the ``scrapy`` CLI (via pipenv) rather
    than Scrapy's in-process API because the CLI runner plays well with
    threads and allows several spiders to run concurrently.

    Returns the path of the output file so callers can locate the results.
    """
    # Hostname part of the URL, e.g. "example.com" for "https://example.com/x".
    filename = url.split('/')[2] + '.json'
    if settings.DEBUG:
        filename = os.path.join('crawler_output', filename)
    else:
        filename = os.path.join('/data/crawler_output', filename)

    # The output directory is gitignored / host-mounted, so it may not
    # exist on a fresh checkout or container; Scrapy will not create it.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Remove stale output so a failed crawl can't be mistaken for a fresh
    # one when the file is parsed later.
    if os.path.exists(filename):
        os.remove(filename)

    # Store results in the jsonlines format (one JSON object per page).
    # NOTE(review): "-t" is deprecated in Scrapy 2.x in favour of
    # "-O file.jsonl" / "-o file:jsonlines" — confirm before upgrading.
    subprocess.run([
        'pipenv', 'run', 'scrapy', 'crawl', 'seo_spider',
        '-a', 'url=' + url,
        '-t', 'jsonlines',
        '-o', filename,
    ])
    return filename
added crawler/settings.py
# Scrapy settings for the crawler project.

BOT_NAME = 'status'

SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'

# Identify the crawler honestly so site owners can see who is hitting them.
USER_AGENT = 'status (+https://status.bythewood.me)'

# Respect robots.txt rules on crawled sites.
ROBOTSTXT_OBEY = True

# Keep load on crawled sites (and on this host) low.
CONCURRENT_REQUESTS = 2
added crawler/spiders/__init__.py
@@ -0,0 +1,4 @@# This package will contain the spiders of your Scrapy project## Please refer to the documentation for information on how to create and manage# your spiders.
added crawler/spiders/seo_spider.py
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class SEOSpider(CrawlSpider):
    """Crawl a single site and emit one SEO metadata record per page.

    Error statuses are listed in ``handle_httpstatus_list`` so broken
    pages still appear in the crawl output with their status code
    instead of being dropped by Scrapy.
    """

    name = 'seo_spider'
    handle_httpstatus_list = [200, 301, 302, 303, 307, 400, 401, 403, 404, 500]

    rules = (
        Rule(
            LinkExtractor(),
            callback='parse_local',
            follow=True,
        ),
    )

    def __init__(self, url, *args, **kwargs):
        """``url`` is the site root; its hostname restricts the crawl."""
        super().__init__(*args, **kwargs)
        self.start_urls = [url]
        # Hostname part of the URL, e.g. "example.com".
        self.allowed_domains = [url.split('/')[2]]

    @staticmethod
    def _empty_result(response, content_type):
        """Record skeleton for pages whose metadata cannot be extracted.

        Keeps the same keys as a full record so downstream parsing can
        treat every record uniformly.
        """
        return {
            'url': response.url,
            'status': response.status,
            'type': 'local',
            'content_type': content_type,
            'title': '',
            'description': '',
            'canonical': '',
            'og_title': '',
            'og_description': '',
            'og_image': '',
            'og_url': '',
            'h1': '',
        }

    def parse_local(self, response):
        """Extract SEO-relevant metadata from one crawled page."""
        content_type = response.headers.get('Content-Type', b'').decode("utf-8")
        # Error pages and non-HTML bodies carry no parseable metadata;
        # emit an empty record so they still show up in the output.
        # (Both cases previously returned identical duplicated dicts.)
        if response.status != 200 or "text/html" not in content_type:
            return self._empty_result(response, content_type)
        return {
            'url': response.url,
            'status': response.status,
            'type': 'local',
            'content_type': content_type,
            'title': response.xpath('normalize-space(//title)').get(),
            'description': response.xpath('normalize-space(//meta[@name="description"]/@content)').get(),
            'canonical': response.xpath('normalize-space(//link[@rel="canonical"]/@href)').get(),
            'og_title': response.xpath('normalize-space(//meta[@property="og:title"]/@content)').get(),
            'og_description': response.xpath('normalize-space(//meta[@property="og:description"]/@content)').get(),
            'og_image': response.xpath('normalize-space(//meta[@property="og:image"]/@content)').get(),
            'og_url': response.xpath('normalize-space(//meta[@property="og:url"]/@content)').get(),
            'h1': response.xpath('normalize-space(//h1)').get(),
        }
modified properties/admin.py
@@ -11,7 +11,6 @@ class PropertyAdmin(admin.ModelAdmin):        "total_checks",        "last_run_at",        "next_run_at",        "run_interval",        "should_check",    )    list_filter = ("user__username",)
modified properties/management/commands/scheduler.py
@@ -10,6 +10,7 @@ from properties.models import Property, Checkq = queue.Queue()q_status = queue.Queue()class Command(BaseCommand):
@@ -31,24 +32,27 @@ class Command(BaseCommand):        self.stdout.write("[Scheduler] Checking lighthouse {}".format(property.url))        property.process_check_lighthouse()    def queue_add(self, property_id, lighthouse=False):        q.put((property_id, lighthouse))    def thread_target_crawler(self, property_id):        property = Property.objects.get(id=property_id)        self.stdout.write("[Scheduler] Checking crawler {}".format(property.url))        property.crawl_site()    def queue_add(self, property_id, property_type):        q.put((property_id, property_type))    def queue_add_status(self, property_id, property_type):        q_status.put((property_id, property_type))    def queue_process(self):        while True:            """            Use 2 threads to process each item in the queue. If lighthouse == False            then use thread_target in the thread_target function. If lighthouse == True            then use thread_target_lighthouse in the thread_target_lighthouse function.            """            if not q.empty():                threads = []                for i in range(2):                    q_data = q.get()                    if q_data[1]:                    if q_data[1] == "lighthouse":                        t = threading.Thread(target=self.thread_target_lighthouse, args=(q_data[0],))                    else:                        t = threading.Thread(target=self.thread_target, args=(q_data[0],))                    elif q_data[1] == "crawler":                        t = threading.Thread(target=self.thread_target_crawler, args=(q_data[0],))                    t.daemon = True                    t.start()                    threads.append(t)
@@ -57,6 +61,58 @@ class Command(BaseCommand):                    q.task_done()            time.sleep(1)    def queue_process_status(self):        while True:            if not q_status.empty():                threads = []                for i in range(2):                    q_data = q_status.get()                    if q_data[1] == "status":                        t = threading.Thread(target=self.thread_target, args=(q_data[0],))                    t.daemon = True                    t.start()                    threads.append(t)                for t in threads:                    t.join()                    q_status.task_done()            time.sleep(1)    def queue_check_status(self):        properties = [p for p in Property.objects.all() if p.should_check()]        for p in properties:            p.next_run_at = p.get_next_run_at()            p.last_run_at = timezone.now()            p.save(update_fields=["next_run_at", "last_run_at"])        properties = [p.id for p in properties]        db.connections.close_all()        for p_id in properties:            self.queue_add_status(p_id, "status")    def queue_check_lighthouse(self):        properties = [p for p in Property.objects.all() if p.should_check_lighthouse()]        for p in properties:            p.next_lighthouse_run_at = p.get_next_run_at_lighthouse()            p.last_lighthouse_run_at = timezone.now()            p.save(update_fields=["next_lighthouse_run_at", "last_lighthouse_run_at"])        properties = [p.id for p in properties]        db.connections.close_all()        for p_id in properties:            self.queue_add(p_id, "lighthouse")    def queue_check_crawler(self):        properties = [p for p in Property.objects.all() if p.should_check_crawl()]        for p in properties:            p.next_run_at_crawler = p.get_next_run_at_crawl()            p.last_run_at_crawler = timezone.now()            p.save(update_fields=["next_run_at_crawler", "last_run_at_crawler"])        properties = [p.id for p 
in properties]        db.connections.close_all()        for p_id in properties:            self.queue_add(p_id, "crawler")    def handle(self, *args, **options):        self.stdout.write("[Scheduler] Starting scheduler...")
@@ -65,36 +121,18 @@ class Command(BaseCommand):        t.daemon = True        t.start()        # Start queue_process thread        t = threading.Thread(target=self.queue_process_status)        t.daemon = True        t.start()        # Start our loop to check properties every 30 seconds        while True:            # Do our standard checks            # Only run 10 at a time            properties = [p for p in Property.objects.all() if p.should_check()]            for p in properties:                p.next_run_at = p.get_next_run_at()                p.last_run_at = timezone.now()                p.save(update_fields=["next_run_at", "last_run_at"])            properties = [p.id for p in properties]            db.connections.close_all()            for p_id in properties:                self.queue_add(p_id)            self.queue_check_status()            self.queue_check_lighthouse()            self.queue_check_crawler()            self.clean_checks()            # Do our daily lighthouse checks            # Only run 1 of these checks per loop to avoid overloading the server            properties = [p for p in Property.objects.all() if p.should_check_lighthouse()]            for p in properties:                p.next_lighthouse_run_at = p.get_next_run_at_lighthouse()                p.last_lighthouse_run_at = timezone.now()                p.save(update_fields=["next_lighthouse_run_at", "last_lighthouse_run_at"])            properties = [p.id for p in properties]            db.connections.close_all()            for p_id in properties:                self.queue_add(p_id, True)            self.stdout.write("[Scheduler] Sleeping scheduler for 30 seconds...")            try:                time.sleep(30)
added properties/migrations/0004_remove_property_run_interval.py
# Generated by Django 4.0.6 on 2022-07-23 00:33

from django.db import migrations


class Migration(migrations.Migration):
    # Drops Property.run_interval: the per-property interval choice was
    # replaced by a fixed schedule in the model.

    dependencies = [
        ('properties', '0003_property_properties__url_7229e0_idx_and_more'),
    ]

    operations = [
        migrations.RemoveField(
            model_name='property',
            name='run_interval',
        ),
    ]
added properties/migrations/0005_property_last_run_at_crawler_and_more.py
# Generated by Django 4.0.6 on 2022-07-23 01:50

from django.db import migrations, models


class Migration(migrations.Migration):
    # Adds the crawler scheduling timestamps used by should_check_crawl()
    # to decide when a property's next SEO crawl is due.

    dependencies = [
        ('properties', '0004_remove_property_run_interval'),
    ]

    operations = [
        migrations.AddField(
            model_name='property',
            name='last_run_at_crawler',
            field=models.DateTimeField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name='property',
            name='next_run_at_crawler',
            field=models.DateTimeField(blank=True, null=True),
        ),
    ]
added properties/migrations/0006_property_crawler_insights.py
# Generated by Django 4.0.6 on 2022-07-23 02:36

from django.db import migrations, models


class Migration(migrations.Migration):
    # Adds crawler_insights: the JSON list of SEO issues produced by
    # CrawlerMixin.parse_crawl() after each crawl.

    dependencies = [
        ('properties', '0005_property_last_run_at_crawler_and_more'),
    ]

    operations = [
        migrations.AddField(
            model_name='property',
            name='crawler_insights',
            field=models.JSONField(blank=True, null=True),
        ),
    ]
modified properties/models.py
@@ -1,5 +1,8 @@import reimport uuidimport osimport jsonimport loggingimport requestsfrom django.contrib.auth import get_user_model
@@ -8,9 +11,15 @@ from django.db import modelsfrom django.template.loader import render_to_stringfrom django.utils import timezonefrom django.utils.functional import cached_propertyfrom django.conf import settingsfrom crawler.runner import run_seo_spiderfrom status.lighthouse import fetch_lighthouse_results, parse_lighthouse_resultslogger = logging.getLogger(__name__)User = get_user_model()
class CrawlerMixin:
    """Mixin for Property: run the SEO crawler for the site and turn the
    raw per-page crawl records into a list of actionable SEO insights
    stored on ``crawler_insights``."""

    @cached_property
    def get_crawl_output(self):
        """Load this property's crawl output (one JSON object per line).

        Files live in ``crawler_output/`` when DEBUG else
        ``/data/crawler_output/``; the filename is the site hostname plus
        ``.json``. Returns [] when no crawl output exists yet.
        """
        if settings.DEBUG:
            path = "crawler_output/"
        else:
            path = "/data/crawler_output/"
        try:
            with open(os.path.join(path, self.url.split("/")[2] + ".json")) as f:
                return [json.loads(line) for line in f]
        except FileNotFoundError:
            return []

    def get_next_run_at_crawl(self):
        """Crawls run daily."""
        return timezone.now() + timezone.timedelta(days=1)

    def should_check_crawl(self):
        """True when the property has never been crawled or its next
        crawl time has passed."""
        now = timezone.now()
        if self.last_run_at_crawler is None:
            return True
        if self.next_run_at_crawler is None:
            return True
        return self.next_run_at_crawler <= now

    @cached_property
    def _crawl_dupe_counts(self):
        """Occurrence counters for the duplicate-content checks.

        Counting once up front replaces the original per-page scans of
        the whole crawl output (O(n^2) over a crawl) with O(1) lookups.
        Returns (value counts per field, (url, value) pair counts per
        field); all records are counted — including non-HTML ones — to
        match the original list-scan semantics.
        """
        field_counts = {"title": {}, "description": {}, "h1": {}}
        pair_counts = {"title": {}, "description": {}, "h1": {}}
        for record in self.get_crawl_output:
            for field in field_counts:
                value = record.get(field, "")
                field_counts[field][value] = field_counts[field].get(value, 0) + 1
                pair = (record.get("url"), value)
                pair_counts[field][pair] = pair_counts[field].get(pair, 0) + 1
        return field_counts, pair_counts

    def _is_duplicate(self, page, field):
        """True iff some crawled record with a *different* URL shares this
        page's value for ``field`` (same rule as the original per-page
        list comprehension)."""
        field_counts, pair_counts = self._crawl_dupe_counts
        value = page[field]
        same_url = pair_counts[field].get((page["url"], value), 0)
        return field_counts[field].get(value, 0) - same_url >= 1

    def parse_page(self, page):
        """Return the list of SEO issue dicts found on one crawled page."""
        insights = []
        # Only HTML pages carry the metadata inspected below.
        if "text/html" not in page.get("content_type", ""):
            return insights
        # Make sure all pages have a title
        if page['title'] == '':
            logger.warning(f"Page {page['url']} has no title")
            insights.append({
                'url': page['url'],
                'issue': 'Page has no title',
                'type': 'seo',
            })
        # Make sure pages have a title between 30 and 60 characters
        if len(page['title']) < 30 or len(page['title']) > 60:
            logger.warning(f"Page {page['url']} has title of length {len(page['title'])}")
            insights.append({
                'url': page['url'],
                'item': page['title'],
                'issue': 'Page title is not between 30 and 60 characters',
                'type': 'seo',
            })
        # Make sure pages have a unique title
        if self._is_duplicate(page, 'title'):
            logger.warning(f"Page {page['url']} has duplicate title")
            insights.append({
                'url': page['url'],
                'item': page['title'],
                'issue': 'Page has duplicate title',
                'type': 'seo',
            })
        # Make sure pages have a description
        if page['description'] == '':
            logger.warning(f"Page {page['url']} has no description")
            insights.append({
                'url': page['url'],
                'issue': 'Page has no description',
                'type': 'seo',
            })
        # Make sure pages have a description between 70 and 160 characters
        if len(page['description']) < 70 or len(page['description']) > 160:
            logger.warning(f"Page {page['url']} has description of length {len(page['description'])}")
            insights.append({
                'url': page['url'],
                'item': page['description'],
                'issue': 'Page description is not between 70 and 160 characters',
                'type': 'seo',
            })
        # Make sure pages have a unique description
        if self._is_duplicate(page, 'description'):
            logger.warning(f"Page {page['url']} has duplicate description")
            insights.append({
                'url': page['url'],
                'item': page['description'],
                'issue': 'Page has duplicate description',
                'type': 'seo',
            })
        # Make sure pages have an h1
        if page['h1'] == '':
            logger.warning(f"Page {page['url']} has no h1")
            insights.append({
                'url': page['url'],
                'issue': 'Page has no h1',
                'type': 'seo',
            })
        # Make sure pages have an h1 between 20 and 70 characters
        if len(page['h1']) < 20 or len(page['h1']) > 70:
            logger.warning(f"Page {page['url']} has h1 of length {len(page['h1'])}")
            insights.append({
                'url': page['url'],
                'item': page['h1'],
                'issue': 'Page h1 is not between 20 and 70 characters',
                'type': 'seo',
            })
        # Make sure pages have a unique h1
        if self._is_duplicate(page, 'h1'):
            logger.warning(f"Page {page['url']} has duplicate h1")
            insights.append({
                'url': page['url'],
                'item': page['h1'],
                'issue': 'Page has duplicate h1',
                'type': 'seo',
            })
        # Make sure pages have a canonical url
        if page['canonical'] == '':
            logger.warning(f"Page {page['url']} has no canonical url")
            insights.append({
                'url': page['url'],
                'issue': 'Page has no canonical url',
                'type': 'seo',
            })
        return insights

    def parse_crawl(self):
        """Recompute and persist crawler_insights from the crawl output."""
        insights = []
        for page in self.get_crawl_output:
            insights.extend(self.parse_page(page))
        self.crawler_insights = insights
        self.save(update_fields=['crawler_insights'])

    def crawl_site(self):
        """Run the spider for this property, then parse its fresh output."""
        run_seo_spider(self.url)
        # Drop cached crawl data so parsing can't see pre-crawl output
        # (both are per-instance cached_property values).
        self.__dict__.pop('get_crawl_output', None)
        self.__dict__.pop('_crawl_dupe_counts', None)
        self.parse_crawl()
@@ -152,10 +306,13 @@ class Property(AlertsMixin, SecurityMixin, models.Model):    is_public = models.BooleanField(default=False)    run_interval = models.IntegerField(choices=RUN_INTERVAL_CHOICES, default=180)    last_run_at = models.DateTimeField(blank=True, null=True)    next_run_at = models.DateTimeField(blank=True, null=True)    last_run_at_crawler = models.DateTimeField(blank=True, null=True)    next_run_at_crawler = models.DateTimeField(blank=True, null=True)    crawler_insights = models.JSONField(blank=True, null=True)    lighthouse_scores = models.JSONField(blank=True, null=True)    last_lighthouse_run_at = models.DateTimeField(blank=True, null=True)    next_lighthouse_run_at = models.DateTimeField(blank=True, null=True)
@@ -180,35 +337,10 @@ class Property(AlertsMixin, SecurityMixin, models.Model):        return self.url.split("/")[2].replace("www.", "")    def get_next_run_at(self):        """        Returns the next run datetime. Should be in whole increments of the interval.        """        now = timezone.now()        if self.run_interval == 60:            return now.replace(                minute=(now.minute // 1) * 1, second=0, microsecond=0            ) + timezone.timedelta(minutes=1)        elif self.run_interval == 180:            return now.replace(                minute=(now.minute // 3) * 3, second=0, microsecond=0            ) + timezone.timedelta(minutes=3)        elif self.run_interval == 300:            return now.replace(                minute=(now.minute // 5) * 5, second=0, microsecond=0            ) + timezone.timedelta(minutes=5)        elif self.run_interval == 900:            return now.replace(                minute=(now.minute // 15) * 15, second=0, microsecond=0            ) + timezone.timedelta(minutes=15)        elif self.run_interval == 1800:            return now.replace(                minute=(now.minute // 30) * 30, second=0, microsecond=0            ) + timezone.timedelta(minutes=30)        elif self.run_interval == 3600:            today = now.replace(minute=0, second=0, microsecond=0)            return today + timezone.timedelta(hours=1)        else:            raise ValueError("Invalid run interval")        return now.replace(            minute=(now.minute // 3) * 3, second=0, microsecond=0        ) + timezone.timedelta(minutes=3)    def should_check(self):        now = timezone.now()
@@ -312,6 +444,7 @@ class Property(AlertsMixin, SecurityMixin, models.Model):            scores = [score for score in self.lighthouse_scores.values()]            return round(sum(scores) / len(scores))class Check(models.Model):    property = models.ForeignKey(        Property, on_delete=models.CASCADE, related_name="statuses", editable=False
modified properties/templates/properties/properties.html
@@ -141,18 +141,20 @@                <div class="card-text text-truncate small">Current status</div>              </div>            </div>            <div class="col-6 col-md-2 d-flex align-items-center text-white {% if not property.invalid_cert %}bg-success{% else %}bg-danger{% endif %}">              <div class="card-body">                <div class="card-title h4">{% if not property.invalid_cert %}Ok{% else %}Unhealthy{% endif %}</div>                <div class="card-text text-truncate small">Certificate</div>              </div>            </div>            <div class="col-6 col-md-2 d-none d-md-flex d-flex align-items-center text-white {% if not property.has_security_issue %}bg-success{% else %}bg-danger{% endif %}">              <div class="card-body">                <div class="card-title h4">{% if not property.has_security_issue %}Ok{% else %}Failed{% endif %}</div>                <div class="card-text text-truncate small">Security</div>              </div>            </div>            {% with property.crawler_insights|length as insights %}            <div class="col-6 col-md-2 d-flex align-items-center text-white {% if insights > 100 %}bg-danger{% elif insights > 25 %}bg-warning{% else %}bg-success{% endif %}">              <div class="card-body">                <div class="card-title h4">{{ insights }}</div>                <div class="card-text text-truncate small">Crawler issues</div>              </div>            </div>            {% endwith %}            <div class="col-6 col-md-2 d-none d-md-flex d-flex align-items-center text-white {% if property.avg_lighthouse_score < 80 %}bg-warning{% else %}bg-success{% endif %}">              <div class="card-body">                <div class="card-title h4">
modified properties/templates/properties/property.html
@@ -88,25 +88,25 @@<div class="container-fluid">  <div class="row {% if not property.lighthouse_scores %}mb-4{% endif %}">    <div class="col-6 col-md-3 d-flex align-items-center text-white {% if property.current_status == 200 %}bg-success{% else %}bg-danger{% endif %}">      <div class="card-body text-center">      <div class="card-body text-center py-2">        <div class="card-title h4">{% if property.current_status == 200 %}Ok{% else %}Failed{% endif %}</div>        <div class="card-text text-truncate small">Current status</div>      </div>    </div>    <div class="col-6 col-md-3 d-flex align-items-center text-white {% if not property.invalid_cert %}bg-success{% else %}bg-danger{% endif %}">      <div class="card-body text-center">      <div class="card-body text-center py-2">        <div class="card-title h4">{% if not property.invalid_cert %}Ok{% else %}Unhealthy{% endif %}</div>        <div class="card-text text-truncate small">Certificate</div>      </div>    </div>    <div class="col-6 col-md-3 d-flex align-items-center text-white {% if not property.has_security_issue %}bg-success{% else %}bg-danger{% endif %}">      <div class="card-body text-center">      <div class="card-body text-center py-2">        <div class="card-title h4">{% if not property.has_security_issue %}Ok{% else %}Failed{% endif %}</div>        <div class="card-text text-truncate small">Security</div>      </div>    </div>    <div class="col-6 col-md-3 d-flex align-items-center text-white {% if property.avg_response_time > 500 %}bg-danger{% else %}bg-success{% endif %}">      <div class="card-body text-center">      <div class="card-body text-center py-2">        <div class="card-title h4">{% if property.avg_response_time > 500 %}Unhealthy{% else %}Ok{% endif %}</div>        <div class="card-text text-truncate small">Response time</div>      </div>
@@ -119,7 +119,7 @@  <div class="row mb-4">    {% for category, score in property.lighthouse_scores.items %}    <div class="col-6 col-md-3 d-flex align-items-center text-white {% if score >= 80 %}bg-success{% else %}bg-warning{% endif %}">      <div class="card-body text-center">      <div class="card-body text-center py-2">        <div class="card-title h4">{{ score }}%</div>        <div class="card-text text-truncate small">{{ category }}</div>      </div>
@@ -193,4 +193,41 @@    </div>  </div></div>{% if property.crawler_insights %}<div class="container mt-4">  <div class="row bg-dark text-white py-2 rounded-top fw-bolder">    <div class="col-1">      Type    </div>    <div class="col-3">      URL    </div>    <div class="col-3">      Item    </div>    <div class="col-5">      Issue    </div>  </div>  {% for insight in property.crawler_insights|dictsort:"url" %}  <div class="row bg-light py-2 border-bottom">    <div class="col-md-1">      {{ insight.type|upper }}    </div>    <div class="col-md-3" style="word-wrap: break-word;">      <a href="{{ insight.url }}?utm_source=status.bythewood.me&utm_medium=insights" target="_blank">        {{ insight.url }}      </a>    </div>    <div class="col-md-3">      {{ insight.item }}    </div>    <div class="col-md-5">      {{ insight.issue }}    </div>  </div>  {% endfor %}</div>{% endif %}{% endblock %}
added scrapy.cfg
@@ -0,0 +1,2 @@[settings]default = crawler.settings