aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/.DS_Storebin6148 -> 6148 bytes
-rw-r--r--src/buxton/scraper.py69
-rw-r--r--src/buxton/source/Bill_Notes_Bill_Notes_CyKey.docxbin0 -> 1675500 bytes
-rw-r--r--src/buxton/source/Bill_Notes_Braun_T3.docxbin0 -> 1671968 bytes
-rw-r--r--src/buxton/source/Bill_Notes_CasioC801.docxbin0 -> 574664 bytes
-rw-r--r--src/buxton/source/Bill_Notes_Casio_Mini.docxbin0 -> 581069 bytes
-rw-r--r--src/buxton/source/Bill_Notes_FingerWorks_Prototype.docxbin0 -> 585090 bytes
-rw-r--r--src/buxton/source/Bill_Notes_Fingerworks_TouchStream.docxbin0 -> 1722555 bytes
-rw-r--r--src/buxton/source/Bill_Notes_FrogPad.docxbin0 -> 840173 bytes
-rw-r--r--src/buxton/source/Bill_Notes_Gavilan_SC.docxbin0 -> 1695290 bytes
-rw-r--r--src/buxton/source/Bill_Notes_Grandjean_Stenotype.docxbin0 -> 2094142 bytes
-rw-r--r--src/buxton/source/Bill_Notes_Matias.docxbin0 -> 590407 bytes
-rw-r--r--src/buxton/source/Bill_Notes_MousePen.docxbin0 -> 505322 bytes
-rw-r--r--src/buxton/source/Bill_Notes_NewO.docxbin0 -> 2264571 bytes
-rw-r--r--src/buxton/source/Bill_Notes_OLPC.docxbin0 -> 6883659 bytes
-rw-r--r--src/buxton/source/Bill_Notes_PARCkbd.docxbin0 -> 631959 bytes
-rw-r--r--src/buxton/source/Bill_Notes_Philco_Mystery_Control.docxbin0 -> 1994439 bytes
-rw-r--r--src/buxton/source/Bill_Notes_TASA_Kbd.docxbin0 -> 461199 bytes
-rw-r--r--src/buxton/source/Bill_Notes_The_Tap.docxbin0 -> 711321 bytes
19 files changed, 46 insertions, 23 deletions
diff --git a/src/.DS_Store b/src/.DS_Store
index d70e95c0a..071dafa1e 100644
--- a/src/.DS_Store
+++ b/src/.DS_Store
Binary files differ
diff --git a/src/buxton/scraper.py b/src/buxton/scraper.py
index 0abebb485..854c99379 100644
--- a/src/buxton/scraper.py
+++ b/src/buxton/scraper.py
@@ -8,9 +8,12 @@ import shutil
import uuid
source = "./source"
-dist = "./Dash-Web/src/server/public/files"
+dist = "../server/public/files"
+
+db = MongoClient("localhost", 27017)["Dash"]
+db.buxton.drop()
+collection_handle = db.buxton
-collection_handle = MongoClient("localhost", 27017)["Dash"]["buxton"]
def extract_links(fileName):
links = []
@@ -22,8 +25,11 @@ def extract_links(fileName):
links.append(item._target)
return links
+
def extract_value(kv_string):
- return kv_string.split(":")[1].strip()
+ pieces = kv_string.split(":")
+ return (pieces[1] if len(pieces) > 1 else kv_string).strip()
+
def mkdir_if_absent(path):
try:
@@ -32,6 +38,7 @@ def mkdir_if_absent(path):
except OSError:
print("Failed to create the appropriate directory structures for %s" % file_name)
+
def parse_document(file_name: str):
result = {}
pure_name = file_name.split(".")[0]
@@ -41,15 +48,18 @@ def parse_document(file_name: str):
raw = str(docx2txt.process(source + "/" + file_name, dir_path))
- sanitize = lambda line: re.sub("[\n\t]+", "", line).strip().replace(u"\u00A0", " ").replace(u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''')
- remove_empty = lambda line: len(line) > 1
+ def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace(
+ u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip()
+
+ def remove_empty(line): return len(line) > 1
lines = list(map(sanitize, raw.split("\n")))
lines = list(filter(remove_empty, lines))
result["file_name"] = file_name
- result["title"] = lines[2]
- result["short_description"] = lines[3].replace("Short Description: ", "")
+ result["title"] = lines[2].strip()
+ result["short_description"] = lines[3].strip().replace(
+ "Short Description: ", "")
cur = 5
notes = ""
@@ -57,32 +67,44 @@ def parse_document(file_name: str):
notes += lines[cur] + " "
cur += 1
result["buxton_notes"] = notes.strip()
-
+
cur += 1
- clean = list(map(lambda data: data.strip().split(":"), lines[cur].split("|")))
- result["company"] = clean[0][1].strip()
- result["year"] = clean[1][1].strip()
- result["original_price"] = clean[2][1].strip()
+ clean = list(
+ map(lambda data: data.strip().split(":"), lines[cur].split("|")))
+ result["company"] = clean[0][len(clean[0]) - 1].strip()
+ result["year"] = clean[1][len(clean[1]) - 1].strip()
+ result["original_price"] = clean[2][len(clean[2]) - 1].strip()
cur += 1
result["degrees_of_freedom"] = extract_value(lines[cur])
cur += 1
- result["dimensions"] = extract_value(lines[cur])
- cur += 2
+ dimensions = lines[cur].lower()
+ if dimensions.startswith("dimensions"):
+ result["dimensions"] = dimensions[11:].strip()
+ cur += 1
+ while lines[cur] != "Key Words":
+ result["dimensions"] += (" " + lines[cur].strip())
+ cur += 1
+
+ cur += 1
result["primary_key"] = extract_value(lines[cur])
cur += 1
result["secondary_key"] = extract_value(lines[cur])
- result["hyperlinks"] = extract_links(source + "/" + file_name)
+ while lines[cur] != "Links":
+ result["secondary_key"] += (" " + extract_value(lines[cur]).strip())
+ cur += 1
- cur += 2
+ cur += 1
link_descriptions = []
while lines[cur] != "Image":
- link_descriptions.append(lines[cur])
+ link_descriptions.append(lines[cur].strip())
cur += 1
result["link_descriptions"] = link_descriptions
+ result["hyperlinks"] = extract_links(source + "/" + file_name)
+
images = []
captions = []
cur += 3
@@ -99,9 +121,11 @@ def parse_document(file_name: str):
while cur < len(lines):
notes.append(lines[cur])
cur += 1
- result["notes"] = notes
+ if len(notes) > 0:
+ result["notes"] = notes
+
+ return result
- return result
def upload(document):
wrapper = {}
@@ -110,11 +134,13 @@ def upload(document):
wrapper["__type"] = "Doc"
collection_handle.insert_one(wrapper)
+
if os.path.exists(dist):
shutil.rmtree(dist)
-while (os.path.exists(dist)):
+while os.path.exists(dist):
pass
os.mkdir(dist)
+mkdir_if_absent(source)
for file_name in os.listdir(source):
if file_name.endswith('.docx'):
@@ -123,6 +149,3 @@ for file_name in os.listdir(source):
lines = ['*', '!.gitignore']
with open(dist + "/.gitignore", 'w') as f:
f.write('\n'.join(lines))
-
-
-
diff --git a/src/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx b/src/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx
new file mode 100644
index 000000000..06094b4d3
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_Braun_T3.docx b/src/buxton/source/Bill_Notes_Braun_T3.docx
new file mode 100644
index 000000000..356697092
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_Braun_T3.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_CasioC801.docx b/src/buxton/source/Bill_Notes_CasioC801.docx
new file mode 100644
index 000000000..cd89fb97b
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_CasioC801.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_Casio_Mini.docx b/src/buxton/source/Bill_Notes_Casio_Mini.docx
new file mode 100644
index 000000000..a503cddfc
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_Casio_Mini.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_FingerWorks_Prototype.docx b/src/buxton/source/Bill_Notes_FingerWorks_Prototype.docx
new file mode 100644
index 000000000..4d13a8cf5
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_FingerWorks_Prototype.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx b/src/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx
new file mode 100644
index 000000000..578a1be08
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_FrogPad.docx b/src/buxton/source/Bill_Notes_FrogPad.docx
new file mode 100644
index 000000000..d01e1bf5c
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_FrogPad.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_Gavilan_SC.docx b/src/buxton/source/Bill_Notes_Gavilan_SC.docx
new file mode 100644
index 000000000..7bd28b376
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_Gavilan_SC.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_Grandjean_Stenotype.docx b/src/buxton/source/Bill_Notes_Grandjean_Stenotype.docx
new file mode 100644
index 000000000..0615c4953
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_Grandjean_Stenotype.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_Matias.docx b/src/buxton/source/Bill_Notes_Matias.docx
new file mode 100644
index 000000000..547603256
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_Matias.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_MousePen.docx b/src/buxton/source/Bill_Notes_MousePen.docx
new file mode 100644
index 000000000..4e1056636
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_MousePen.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_NewO.docx b/src/buxton/source/Bill_Notes_NewO.docx
new file mode 100644
index 000000000..a514926d2
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_NewO.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_OLPC.docx b/src/buxton/source/Bill_Notes_OLPC.docx
new file mode 100644
index 000000000..bfca0a9bb
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_OLPC.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_PARCkbd.docx b/src/buxton/source/Bill_Notes_PARCkbd.docx
new file mode 100644
index 000000000..c0cf6ba9a
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_PARCkbd.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_Philco_Mystery_Control.docx b/src/buxton/source/Bill_Notes_Philco_Mystery_Control.docx
new file mode 100644
index 000000000..ad06903f3
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_Philco_Mystery_Control.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_TASA_Kbd.docx b/src/buxton/source/Bill_Notes_TASA_Kbd.docx
new file mode 100644
index 000000000..e4c659de9
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_TASA_Kbd.docx
Binary files differ
diff --git a/src/buxton/source/Bill_Notes_The_Tap.docx b/src/buxton/source/Bill_Notes_The_Tap.docx
new file mode 100644
index 000000000..8ceebc71e
--- /dev/null
+++ b/src/buxton/source/Bill_Notes_The_Tap.docx
Binary files differ