{ "cells": [ { "cell_type": "markdown", "id": "7af094ba", "metadata": {}, "source": [ "# MCS 275 Spring 2022 Lecture 40" ] }, { "cell_type": "markdown", "id": "c6a86b94", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": 19, "id": "535b2603", "metadata": {}, "outputs": [], "source": [ "from urllib.request import urlopen\n", "import bs4" ] }, { "cell_type": "markdown", "id": "3b7d750e", "metadata": {}, "source": [ "## Utility functions" ] }, { "cell_type": "markdown", "id": "0d4f3b35", "metadata": {}, "source": [ "A function to get the content of a web page as a string (basically, what we did manually in Lecture 39)." ] }, { "cell_type": "code", "execution_count": 20, "id": "95e156a6", "metadata": {}, "outputs": [], "source": [ "def urlreadtext(url,*args,**kwargs):\n", " \"\"\"\n", " Retrieve URL `url` and return the response body decoded as a string. If\n", " content-type specifies a charset, use that. Otherwise, attempt UTF-8\n", " decoding. Returns the resulting string.\n", "\n", " Additional arguments are passed to `urllib.request.urlopen`.\n", " \"\"\"\n", " print(\"Opening URL '{}'\".format(url))\n", " with urlopen(url,*args,**kwargs) as res:\n", " # Get raw data (bytes)\n", " data = res.read()\n", " # Determine the encoding\n", " encoding = res.headers.get_content_charset()\n", " if encoding is None:\n", " # Danger: no encoding was specified in the headers\n", " # Try using UTF-8\n", " encoding = \"UTF-8\"\n", "\n", " # TODO: Detect if the response indicates a non-text content type, and if\n", " # so, raise an informative exception rather than just letting the\n", " # attempt to decode as a string fail.\n", "\n", " # Convert to string and return\n", " return data.decode(encoding)\n", "\n", "# Let's apply caching so that we only make one request to any given\n", "# URL, even if `urlreadtext` is called many times\n", "import functools\n", "urlreadtext = functools.lru_cache(maxsize=None)(urlreadtext)" ] }, { "cell_type": "code", "execution_count": 21, "id": "acae7ff7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Opening URL 'http://example.com'\n" ] }, { "data": { "text/plain": [ "'\\n\\n\\n Example Domain\\n\\n \\n \\n \\n \\n\\n\\n\\n
\\n

Example Domain

\\n

This domain is for use in illustrative examples in documents. You may use this\\n domain in literature without prior coordination or asking for permission.

\\n

More information...

\\n
\\n\\n\\n'" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "urlreadtext(\"http://example.com\")" ] }, { "cell_type": "code", "execution_count": 22, "id": "5ec266ba", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'\\n\\n\\n Example Domain\\n\\n \\n \\n \\n \\n\\n\\n\\n
\\n

Example Domain

\\n

This domain is for use in illustrative examples in documents. You may use this\\n domain in literature without prior coordination or asking for permission.

\\n

More information...

\\n
\\n\\n\\n'" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "urlreadtext(\"http://example.com\")" ] }, { "cell_type": "markdown", "id": "7afe12cc", "metadata": {}, "source": [ "A function to generate URLs for MCS 275 related things." ] }, { "cell_type": "code", "execution_count": 23, "id": "aaed57fe", "metadata": {}, "outputs": [], "source": [ "def mcs275url(itemtype,number):\n", " urlbase = \"http://www.dumas.io/teaching/2022/spring/mcs275/\"\n", " sub_fmts = {\n", " \"homework\":\"nbview/homework/homework{}.html\",\n", " \"homework solution\":\"nbview/homework/homework{}soln.html\",\n", " \"worksheet\":\"nbview/worksheets/worksheet{}.html\",\n", " \"worksheet solution\":\"nbview/worksheets/worksheet{}soln.html\",\n", " \"project\":\"nbview/projects/project{}.html\",\n", " \"lecture\":\"slides/lecture{}.html\"\n", " }\n", " if itemtype not in sub_fmts:\n", " raise ValueError(\"itemtype '{}' unknown; must be one of {}\".format(itemtype,sub_fmts.keys()))\n", " return urlbase + sub_fmts[itemtype].format(number)" ] }, { "cell_type": "code", "execution_count": 24, "id": "88fd0e29", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture40.html'" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mcs275url(\"lecture\",40)" ] }, { "cell_type": "code", "execution_count": 25, "id": "494f5d90", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'http://www.dumas.io/teaching/2022/spring/mcs275/nbview/homework/homework12soln.html'" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mcs275url(\"homework solution\",12)" ] }, { "cell_type": "code", "execution_count": 26, "id": "cb719bad", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'http://www.dumas.io/teaching/2022/spring/mcs275/nbview/projects/project4.html'" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mcs275url(\"project\",4) # reminder: Due Fri 29 April!" ] }, { "cell_type": "code", "execution_count": 27, "id": "b77f04d5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture40.html'\n" ] } ], "source": [ "s = urlreadtext(mcs275url(\"lecture\",40))" ] }, { "cell_type": "code", "execution_count": 29, "id": "3625aaf1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'\\n\\n\\n \\n \")\n", "stop = s.find(\"\")\n", "print(\"The title of lecture 40 is:\",s[start+7:stop])" ] }, { "cell_type": "markdown", "id": "4541e12a", "metadata": {}, "source": [ "## Level 1 HTML parsing" ] }, { "cell_type": "code", "execution_count": 41, "id": "fb0f495b", "metadata": {}, "outputs": [], "source": [ "import html.parser\n", "\n", "class TitleExtractor(html.parser.HTMLParser):\n", " def __init__(self,*args,**kwargs):\n", " super().__init__(*args,**kwargs)\n", " self.listening = False\n", " self.captured = \"\"\n", " def handle_starttag(self, tag, attrs):\n", " #print(\"Start tag\",tag)\n", " if tag == \"title\":\n", " self.listening = True\n", " def handle_endtag(self, tag):\n", " #print(\"End tag\",tag)\n", " if tag == \"title\":\n", " self.listening = False\n", " def handle_data(self, data):\n", " if self.listening:\n", " self.captured += data" ] }, { "cell_type": "code", "execution_count": 44, "id": "de262d88", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The title of lecture 40 is: Lec 40: Parsing and scraping HTML\n" ] } ], "source": [ "X = TitleExtractor()\n", "X.feed(urlreadtext(mcs275url(\"lecture\",40)))\n", "print(\"The title of lecture 40 is:\",X.captured)" ] }, { "cell_type": "markdown", "id": "5765e454", "metadata": {}, "source": [ "## Level 2 HTML parsing" ] }, { "cell_type": "code", "execution_count": 53, "id": "b01bf130", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The title of lecture 40 is: Lec 40: Parsing and scraping HTML\n" ] } ], "source": [ "from bs4 import BeautifulSoup\n", "\n", "soup = BeautifulSoup(\n", " urlreadtext(mcs275url(\"lecture\",40)),\n", " \"html.parser\" # Use Python's built-in parser\n", ")\n", "print(\"The title of lecture 40 is:\",soup.title.text)" ] }, { "cell_type": "markdown", "id": "0e14de75", "metadata": {}, "source": [ "## MCS 275 slides analysis" ] }, { "cell_type": "code", "execution_count": 54, "id": "99162275", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture1.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture2.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture3.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture4.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture5.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture6.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture7.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture8.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture9.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture10.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture11.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture12.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture13.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture14.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture15.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture16.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture17.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture18.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture19.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture20.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture21.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture22.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture23.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture24.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture25.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture26.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture27.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture28.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture29.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture30.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture31.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture32.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture33.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture34.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture35.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture36.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture37.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture38.html'\n", "Opening URL 'http://www.dumas.io/teaching/2022/spring/mcs275/slides/lecture39.html'\n" ] } ], "source": [ "from bs4 import BeautifulSoup\n", "import time\n", "\n", "lecture_titles = {}\n", "for n in range(1,41):\n", " time.sleep(0.1)\n", " soup = BeautifulSoup(\n", " urlreadtext(mcs275url(\"lecture\",n)),\n", " \"html.parser\" # Use Python's built-in parser\n", " )\n", " lecture_titles[n] = soup.title.text" ] }, { "cell_type": "code", "execution_count": 55, "id": "a7c55771", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{1: 'Lec 1: Introduction',\n", " 2: 'Lec 2: Python tour part I',\n", " 3: 'Lec 3: Python tour part II',\n", " 4: 'Lec 4: Operator overloading',\n", " 5: 'Lec 5: Inheritance',\n", " 6: 'Lec 6: Inheritance II',\n", " 7: 'Lec 7: Notebooks',\n", " 8: 'Lec 8: Variadic functions and decorators',\n", " 9: 'Lec 9: Context Managers',\n", " 10: 'Lec 10: Errors and debugging',\n", " 11: 'Lec 11: pdb',\n", " 12: 'Lec 12: Recursion',\n", " 13: 'Lec 13: Recursion vs iteration',\n", " 14: 'Lec 14: Recursion vs iteration II',\n", " 15: 'Lec 15: Recursion with backtracking',\n", " 16: 'Lec 16: Mergesort',\n", " 17: 'Lec 17: Quicksort',\n", " 18: 'Lec 18: Comparison sorts',\n", " 19: 'Lec 19: Trees',\n", " 20: 'Lec 20: Binary Search Trees (BST)',\n", " 21: 'Lec 21: BST and tree traversals',\n", " 22: 'Lec 22: set and defaultdict',\n", " 23: 'Lec 23: CSV and JSON',\n", " 24: 'Lec 24: Pillow',\n", " 25: 'Lec 25: Numpy',\n", " 26: 'Lec 26: Numpy II',\n", " 27: 'Lec 27: Julia sets',\n", " 28: 'Lec 28: Matplotlib',\n", " 29: 'Lec 29: Matplotlib II',\n", " 30: 'Lec 30: Databases',\n", " 31: 'Lec 31: Machine learning',\n", " 32: 'Lec 32: SQL and SQLite II',\n", " 33: 'Lec 33: HTML and CSS',\n", " 34: 'Lec 34: Planning our web app',\n", " 35: 'Lec 35: HTTP and Flask',\n", " 36: 'Lec 36: Using Flask',\n", " 37: 'Lec 37: Forms',\n", " 38: 'Lec 38: Web app wrap-up',\n", " 39: 'Lec 39: HTTP Requests',\n", " 40: 'Lec 40: Parsing and scraping HTML'}" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lecture_titles" ] }, { "cell_type": "code", "execution_count": 56, "id": "58cd34ff", "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "\n", "soup = BeautifulSoup(\n", " urlreadtext(mcs275url(\"lecture\",40)),\n", " \"html.parser\" # Use Python's built-in parser\n", ")" ] }, { "cell_type": "code", "execution_count": 62, "id": "f5f04636", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "

Lecture 40

" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "soup.find(\"section\").h1 # Does the first slide have an h1?" ] }, { "cell_type": "code", "execution_count": 67, "id": "a92097f1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "21" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(soup.find_all(\"section\")) # number of slides in lecture 40" ] }, { "cell_type": "code", "execution_count": 73, "id": "20677e93", "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "from bs4 import BeautifulSoup\n", "\n", "lecture_numbers = list(range(1,41))\n", "lecture_slide_counts = []\n", "for n in range(1,41):\n", " soup = BeautifulSoup(\n", " urlreadtext(mcs275url(\"lecture\",n)),\n", " \"html.parser\" # Use Python's built-in parser\n", " )\n", " lecture_slide_counts.append(len(soup.find_all(\"section\")))\n", " \n", "\n", "plt.bar(lecture_numbers,lecture_slide_counts)\n", "plt.title(\"Number of slides in MCS 275 lectures\")\n", "plt.xlabel(\"Lecture number\")\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }