From b38821ee75b85cfcf1803f88f2092b4c742db2ac Mon Sep 17 00:00:00 2001
From: Michael Foiani
Date: Sat, 10 Apr 2021 03:26:13 -0400
Subject: Have a functional fetch-and-processing flow that uses the official EDGAR API.

Still need to optimize the query parameters and load more trades (the feed
caps each request at 100 filings). This removes any need to get filings from
the frontend, along with any need to pay for a third-party API.
---
 .../brown/cs/student/term/parsing/FilingFeed.java  | 99 ++++++++++++++++++++++
 .../cs/student/term/parsing/TxtXmlParser.java      | 65 ++++++++++++++
 .../cs/student/term/repl/commands/LoadCommand.java | 59 +++++++++----
 3 files changed, 206 insertions(+), 17 deletions(-)
 create mode 100644 src/main/java/edu/brown/cs/student/term/parsing/FilingFeed.java
 create mode 100644 src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java
(limited to 'src/main/java/edu/brown')

diff --git a/src/main/java/edu/brown/cs/student/term/parsing/FilingFeed.java b/src/main/java/edu/brown/cs/student/term/parsing/FilingFeed.java
new file mode 100644
index 0000000..d41e918
--- /dev/null
+++ b/src/main/java/edu/brown/cs/student/term/parsing/FilingFeed.java
@@ -0,0 +1,99 @@
+package edu.brown.cs.student.term.parsing;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+import java.time.Instant;
+import java.time.ZonedDateTime;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+import static org.junit.Assert.assertEquals;
+
+public class FilingFeed {
+  private final List<Filing> filings;
+
+  public FilingFeed(Document document) {
+    // Init array
+    filings = new ArrayList<>();
+
+    // Get all entries
+    NodeList entries = document.getElementsByTagName("entry");
+    for (int i = 0; i < entries.getLength(); i++) {
+      // Assertion allows the cast to be ok :)
+      assert entries.item(i).getNodeType() == Node.ELEMENT_NODE;
+      Element entry = (Element) entries.item(i);
+
+      // Get the timestamp from the updated field
+      NodeList updated = entry.getElementsByTagName("updated");
+      // Assert exactly one element.
+ assert updated.getLength() == 1; + String timestamp = updated.item(0).getTextContent(); + ZonedDateTime zonedDateTime = ZonedDateTime.parse(timestamp); + Instant instant = zonedDateTime.toInstant(); + + NodeList link = entry.getElementsByTagName("link"); + assertEquals(link.getLength(), 1); + String linkUrl = link.item(0).getAttributes().getNamedItem("href").getNodeValue(); + + filings.add(new Filing(getXmlUrl(linkUrl), instant)); + } + } + + private String getXmlUrl(String filingUrl) { + return filingUrl.replace("-index.htm", ".txt"); + } + + public List getFilings() { + return filings; + } + + public static class Filing { + // TODO: update to be immutable + private final String xmlUrl; + private final Instant timestamp; + + public Filing(String xmlUrl, Instant timestamp) { + this.xmlUrl = xmlUrl; + this.timestamp = timestamp; + } + + public Instant getTimestamp() { + return timestamp; + } + + public String getXmlUrl() { + return xmlUrl; + } + + @Override + public String toString() { + return "Filing{" + + "xmlUrl='" + xmlUrl + '\'' + + ", timestamp=" + timestamp + + '}'; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Filing filing = (Filing) o; + return Objects.equals(xmlUrl, filing.xmlUrl) && + Objects.equals(timestamp, filing.timestamp); + } + + @Override + public int hashCode() { + return Objects.hash(xmlUrl, timestamp); + } + } + +} diff --git a/src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java b/src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java new file mode 100644 index 0000000..2b8016e --- /dev/null +++ b/src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java @@ -0,0 +1,65 @@ +package edu.brown.cs.student.term.parsing; + +import org.w3c.dom.Document; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import javax.print.Doc; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.StringReader; +import java.net.URL; +import java.net.URLConnection; + +public class TxtXmlParser extends XmlParser { + + public TxtXmlParser() { + super(); + } + + /** + * Method used to parse the xml file. + * + * @param pathToXml The path to the xml text file. + * @return The tree structure parsed as an xml doc. 
+ */ + @Override + public Document parse(String pathToXml) { + try { + System.err.println("LOG: To make class for url: " + pathToXml + " in parse() of " + getClass()); + URL url = new URL(pathToXml); + System.err.println("LOG: To establish urlConnection in parse() of " + getClass()); + URLConnection conn = url.openConnection(); + conn.addRequestProperty("User-Agent", "Chrome"); + System.err.println("LOG: Making bufferedReader for url: " + pathToXml + " in " + getClass()); + BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream())); + + StringBuilder xmlParts = new StringBuilder(); + boolean isXml = false; + String line; + while ((line = br.readLine()) != null) { + if (line.equals("")) { + break; + } + + if (isXml) { + xmlParts.append(line); + } + if (line.equals("")) { + isXml = true; + } + } + System.err.println("LOG: Calling builder.parse() after extracting xml parts from: " + pathToXml + " in " + getClass()); + + InputSource xmlLines = new InputSource(new StringReader(xmlParts.toString())); + return builder.parse(xmlLines); + } catch (SAXException e) { + System.err.println("INTERNAL: SAX " + getClass() + " : " + e.getClass()); + } catch (IOException e) { + e.printStackTrace(); + System.err.println("INTERNAL: IO " + getClass() + " : " + e.getClass()); + } + return null; + } +} diff --git a/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java b/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java index 54f9fc0..a267322 100644 --- a/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java +++ b/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java @@ -2,8 +2,11 @@ package edu.brown.cs.student.term.repl.commands; import edu.brown.cs.student.term.DatabaseQuerier; import edu.brown.cs.student.term.Main; +import edu.brown.cs.student.term.parsing.FilingFeed; import edu.brown.cs.student.term.parsing.Transaction; +import edu.brown.cs.student.term.parsing.TxtXmlParser; import edu.brown.cs.student.term.parsing.UrlXmlParser; +import edu.brown.cs.student.term.parsing.XmlParser; import edu.brown.cs.student.term.repl.Command; import edu.brown.cs.student.term.trade.Trade; import org.json.JSONArray; @@ -18,7 +21,8 @@ import java.time.ZonedDateTime; public class LoadCommand implements Command { private Connection conn; - private final static UrlXmlParser URL_XML_PARSER = new UrlXmlParser(); + private final static XmlParser URL_XML_PARSER = new UrlXmlParser(); + private final static XmlParser TXT_XML_PARSER = new TxtXmlParser(); /** * Main run method for every command. 
@@ -27,23 +31,30 @@ public class LoadCommand implements Command { */ @Override public String run(String[] args) { - // TODO: add log comments + // param checking + if (args.length != 1){ + return "ERROR: Incorrect number of arguments for load command"; + } + + int numFilings; + try { + numFilings = Integer.parseInt(args[0]); + } catch (NumberFormatException e) { + return "ERROR: Please input an integer."; + } + + System.err.println("LOG: Entered .run() of " + getClass()); - // TODO: call to api for urls to call through the urlxmlparser from reagan - if (Main.xmlLinks == null) { - return "ERROR: Please load xml links from frontend."; + FilingFeed filings = getFilings(numFilings); + if (filings.getFilings().isEmpty()) { + System.err.println("WARNING: No filings loaded."); } conn = DatabaseQuerier.getConn(); - JSONArray data = Main.xmlLinks.getJSONArray("data"); - for(int i =0; i < data.length(); i++) { - JSONObject link = data.optJSONObject(i); - - String timestamp = link.getString("timestamp"); - String url = link.getString("url"); + for(FilingFeed.Filing filing : filings.getFilings()) { try { System.err.println("LOG: Calling loadTransactionIntoDB() in " + getClass()); - loadTransactionIntoDB(timestamp, url); + loadTransactionIntoDB(filing.getTimestamp(), filing.getXmlUrl()); } catch (SQLException throwables) { System.err.println("INTERNAL: SQLException in .run() of " + getClass()); //throwables.printStackTrace(); @@ -53,19 +64,33 @@ public class LoadCommand implements Command { return "Loaded?"; } + private FilingFeed getFilings(int numFilings) { + // TODO: make params more adjustable + String queryUrl = "https://www.sec.gov/cgi-bin/browse-edgar?" + + "action=getcurrent" + + "&CIK=" + + "&type=4" + + "&company=" + + "&dateb=" + + "&owner=only" + + "&start=0" + + "&count=" + numFilings + + "&output=atom"; + + Document document = URL_XML_PARSER.parse(queryUrl); + return new FilingFeed(document); + } + /** * Loads a whole transaction, which can have multiple trades, into the DB. * @param url The url to the public xml file. * @throws SQLException If the prep statement fails or db doesn't exist, throws. */ - private void loadTransactionIntoDB(String timestamp, String url) throws SQLException { + private void loadTransactionIntoDB(Instant instant, String url) throws SQLException { System.err.println("LOG: Parsing XML into transaction in loadTransactionIntoDB(). URL: " + url); // TODO: check if this is right @julia // TODO: add parse error handling... - ZonedDateTime zonedDateTime = ZonedDateTime.parse(timestamp); - Instant instant = zonedDateTime.toInstant(); - - Document document = URL_XML_PARSER.parse(url); + Document document = TXT_XML_PARSER.parse(url); if (document == null) { System.err.println("WARNING: URL " + url + " failed to parse... continuing."); return; -- cgit v1.2.3-70-g09d2 From 6244f9bcab93f225fdcf0e3cfb72103f3adedfb6 Mon Sep 17 00:00:00 2001 From: Michael Foiani Date: Sat, 10 Apr 2021 03:38:19 -0400 Subject: Added functionality to add more than 100 trades from the edgar api. 
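
The EDGAR "current events" Atom feed used here returns at most 100 entries per
request, so the loader now pages through the feed by advancing the start
parameter in steps of 100. A minimal standalone sketch of that paging idea
follows; the EdgarFeedPager name and its helper method are illustrative only
and not code from this patch, and the empty CIK/company/dateb parameters are
omitted for brevity:

    import java.util.ArrayList;
    import java.util.List;

    public final class EdgarFeedPager {
      private static final int PAGE_SIZE = 100; // EDGAR's per-request maximum

      /** Builds one Atom query URL per page needed to cover numFilings entries. */
      public static List<String> pagedQueryUrls(int numFilings) {
        List<String> urls = new ArrayList<>();
        for (int start = 0; start < numFilings; start += PAGE_SIZE) {
          urls.add("https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent"
              + "&type=4&owner=only&output=atom"
              + "&start=" + start
              + "&count=" + PAGE_SIZE);
        }
        return urls;
      }
    }

Each of those URLs can then be fetched and handed to FilingFeed, which is
essentially what the while loop in the diff below does.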
It's looking pretty good :) --- data/trades.sqlite3 | Bin 36864 -> 49152 bytes .../cs/student/term/repl/commands/LoadCommand.java | 43 +++++++++++++-------- 2 files changed, 26 insertions(+), 17 deletions(-) (limited to 'src/main/java/edu/brown') diff --git a/data/trades.sqlite3 b/data/trades.sqlite3 index 436e41d..7c6f921 100644 Binary files a/data/trades.sqlite3 and b/data/trades.sqlite3 differ diff --git a/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java b/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java index a267322..b1e8cb4 100644 --- a/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java +++ b/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java @@ -18,6 +18,8 @@ import java.sql.PreparedStatement; import java.sql.SQLException; import java.time.Instant; import java.time.ZonedDateTime; +import java.util.ArrayList; +import java.util.List; public class LoadCommand implements Command { private Connection conn; @@ -45,13 +47,14 @@ public class LoadCommand implements Command { System.err.println("LOG: Entered .run() of " + getClass()); - FilingFeed filings = getFilings(numFilings); - if (filings.getFilings().isEmpty()) { + List allFilings = getFilings(numFilings); + + if (allFilings.isEmpty()) { System.err.println("WARNING: No filings loaded."); } conn = DatabaseQuerier.getConn(); - for(FilingFeed.Filing filing : filings.getFilings()) { + for(FilingFeed.Filing filing : allFilings) { try { System.err.println("LOG: Calling loadTransactionIntoDB() in " + getClass()); loadTransactionIntoDB(filing.getTimestamp(), filing.getXmlUrl()); @@ -64,21 +67,27 @@ public class LoadCommand implements Command { return "Loaded?"; } - private FilingFeed getFilings(int numFilings) { + private List getFilings(int numFilings) { + List all = new ArrayList<>(); + int counter = 0; + while (100*counter <= numFilings) { + String queryUrl = "https://www.sec.gov/cgi-bin/browse-edgar?" + + "action=getcurrent" + + "&CIK=" + + "&type=4" + + "&company=" + + "&dateb=" + + "&owner=only" + + "&start=" + (100*counter++) + + "&count=" + 100 + + "&output=atom"; + System.out.println("LOG: Requesting filings with url: " + queryUrl); + Document document = URL_XML_PARSER.parse(queryUrl); + FilingFeed filingFeed = new FilingFeed(document); + all.addAll(filingFeed.getFilings()); + } // TODO: make params more adjustable - String queryUrl = "https://www.sec.gov/cgi-bin/browse-edgar?" + - "action=getcurrent" + - "&CIK=" + - "&type=4" + - "&company=" + - "&dateb=" + - "&owner=only" + - "&start=0" + - "&count=" + numFilings + - "&output=atom"; - - Document document = URL_XML_PARSER.parse(queryUrl); - return new FilingFeed(document); + return all; } /** -- cgit v1.2.3-70-g09d2 From ecd32acb9366a8ad9d634732b39cbdbdf4c8a53b Mon Sep 17 00:00:00 2001 From: Michael Foiani Date: Mon, 12 Apr 2021 11:03:02 -0400 Subject: Added way to load trades straight fron edgar with an archive option. 
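
Loading "straight from EDGAR" means fetching each filing's full-submission
.txt file and pulling the Form 4 XML out of it, which is what the new
TxtXmlParser logic below does. A rough sketch of that extraction, assuming the
usual EDGAR submission layout in which an <ACCEPTANCE-DATETIME> header line
carries a yyyyMMddHHmmss timestamp and the form data sits between <XML> and
</XML> marker lines; the class and method names here are illustrative, not
part of the patch:

    import java.io.BufferedReader;
    import java.io.IOException;

    final class SubmissionTxtSketch {
      /** Collects the XML body embedded in an EDGAR full-submission .txt stream. */
      static String extractEmbeddedXml(BufferedReader reader) throws IOException {
        StringBuilder xml = new StringBuilder();
        boolean inXml = false;
        String line;
        while ((line = reader.readLine()) != null) {
          if (line.equals("</XML>")) {
            break;                         // closing marker: stop reading
          }
          if (inXml) {
            xml.append(line).append('\n'); // collect the embedded XML
          }
          if (line.equals("<XML>")) {
            inXml = true;                  // opening marker: start collecting
          }
        }
        return xml.toString();
      }
    }

The marker tags above are the string literals the parser compares each line
against; they appear as empty strings in the hunks below, apparently because
the rendered page dropped angle-bracket text.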
--- data/trades.sqlite3 | Bin 49152 -> 344064 bytes src/main/java/edu/brown/cs/student/term/Main.java | 8 ++ .../brown/cs/student/term/parsing/FilingFeed.java | 87 +++++----------- .../cs/student/term/parsing/TxtXmlParser.java | 28 +++++- .../cs/student/term/parsing/UrlXmlParser.java | 2 + .../cs/student/term/repl/commands/LoadCommand.java | 109 ++++++++++++++++----- trades.sqlite3 | 0 7 files changed, 142 insertions(+), 92 deletions(-) create mode 100644 trades.sqlite3 (limited to 'src/main/java/edu/brown') diff --git a/data/trades.sqlite3 b/data/trades.sqlite3 index 7c6f921..c231e0f 100644 Binary files a/data/trades.sqlite3 and b/data/trades.sqlite3 differ diff --git a/src/main/java/edu/brown/cs/student/term/Main.java b/src/main/java/edu/brown/cs/student/term/Main.java index 2a75bd5..55b1634 100644 --- a/src/main/java/edu/brown/cs/student/term/Main.java +++ b/src/main/java/edu/brown/cs/student/term/Main.java @@ -98,12 +98,20 @@ public final class Main { parser.accepts("gui"); parser.accepts("port").withRequiredArg().ofType(Integer.class) .defaultsTo(DEFAULT_PORT); + parser.accepts("debug"); OptionSet options = parser.parse(args); if (options.has("gui")) { runSparkServer((int) options.valueOf("port")); } + if (!options.has("debug")) { + System.setErr(new PrintStream(new OutputStream() { + public void write(int b) { + } + })); + } + HashMap commandHashMap = new HashMap<>(); commandHashMap.put("setup", new SetupCommand()); commandHashMap.put("load", new LoadCommand()); diff --git a/src/main/java/edu/brown/cs/student/term/parsing/FilingFeed.java b/src/main/java/edu/brown/cs/student/term/parsing/FilingFeed.java index d41e918..aac6358 100644 --- a/src/main/java/edu/brown/cs/student/term/parsing/FilingFeed.java +++ b/src/main/java/edu/brown/cs/student/term/parsing/FilingFeed.java @@ -5,17 +5,19 @@ import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; -import java.time.Instant; -import java.time.ZonedDateTime; import java.util.ArrayList; import java.util.List; -import java.util.Objects; - -import static org.junit.Assert.assertEquals; +/** + * Represents the filing from the Edgar rss feed. + */ public class FilingFeed { - private final List filings; + private final List filings; + /** + * Constructor that takes the parsed document and extracts the url. + * @param document The document of the rss feed. + */ public FilingFeed(Document document) { // Init array filings = new ArrayList<>(); @@ -27,73 +29,32 @@ public class FilingFeed { assert entries.item(i).getNodeType() == Node.ELEMENT_NODE; Element entry = (Element) entries.item(i); - // Get the timestamp from updated field - NodeList updated = entry.getElementsByTagName("updated"); - // Assert at least one element. - assert updated.getLength() == 1; - String timestamp = updated.item(0).getTextContent(); - ZonedDateTime zonedDateTime = ZonedDateTime.parse(timestamp); - Instant instant = zonedDateTime.toInstant(); - NodeList link = entry.getElementsByTagName("link"); - assertEquals(link.getLength(), 1); String linkUrl = link.item(0).getAttributes().getNamedItem("href").getNodeValue(); - filings.add(new Filing(getXmlUrl(linkUrl), instant)); + filings.add(getXmlUrl(linkUrl)); } } + /** + * Turns the local url into a publicly hosted one. + * @param filingUrl The local url of the .txt to the filing. + * @return The publicly hosted version of the url. 
+ */ private String getXmlUrl(String filingUrl) { - return filingUrl.replace("-index.htm", ".txt"); + String url = filingUrl.replace("-index.htm", ".txt"); + if (!url.contains("https://www.sec.gov/")) { + url = "https://www.sec.gov/" + url; + } + return url; } - public List getFilings() { + /** + * Accessor that returns the url to the txt format of the filings. + * @return The list of publicly hosted urls to each filing. + */ + public List getFilings() { return filings; } - public static class Filing { - // TODO: update to be immutable - private final String xmlUrl; - private final Instant timestamp; - - public Filing(String xmlUrl, Instant timestamp) { - this.xmlUrl = xmlUrl; - this.timestamp = timestamp; - } - - public Instant getTimestamp() { - return timestamp; - } - - public String getXmlUrl() { - return xmlUrl; - } - - @Override - public String toString() { - return "Filing{" + - "xmlUrl='" + xmlUrl + '\'' + - ", timestamp=" + timestamp + - '}'; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - Filing filing = (Filing) o; - return Objects.equals(xmlUrl, filing.xmlUrl) && - Objects.equals(timestamp, filing.timestamp); - } - - @Override - public int hashCode() { - return Objects.hash(xmlUrl, timestamp); - } - } - } diff --git a/src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java b/src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java index 2b8016e..1be5f6f 100644 --- a/src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java +++ b/src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java @@ -4,7 +4,6 @@ import org.w3c.dom.Document; import org.xml.sax.InputSource; import org.xml.sax.SAXException; -import javax.print.Doc; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; @@ -12,10 +11,15 @@ import java.io.StringReader; import java.net.URL; import java.net.URLConnection; +/** + * Class that parses the XML contained within a publicly held txt file. + */ public class TxtXmlParser extends XmlParser { + private long timestamp; public TxtXmlParser() { super(); + timestamp = -1; } /** @@ -36,13 +40,22 @@ public class TxtXmlParser extends XmlParser { BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream())); StringBuilder xmlParts = new StringBuilder(); + boolean isXml = false; String line; while ((line = br.readLine()) != null) { + // Get timestamp + if (line.startsWith("")) { + String timestampString = line.replaceAll("", ""); + + // TODO: check for errors + this.timestamp = Long.parseLong(timestampString); + } + + // For xml if (line.equals("")) { break; } - if (isXml) { xmlParts.append(line); } @@ -62,4 +75,15 @@ public class TxtXmlParser extends XmlParser { } return null; } + + /** + * Returns the timestamp then resets it to -1. + * @return The timestamp as a number (long). -1 if not assigned. + */ + public long getTimestamp() { + long temp = timestamp; + // Set to -1 for next one... 
+ timestamp = -1; + return temp; + } } diff --git a/src/main/java/edu/brown/cs/student/term/parsing/UrlXmlParser.java b/src/main/java/edu/brown/cs/student/term/parsing/UrlXmlParser.java index c89c31d..21cd7c5 100644 --- a/src/main/java/edu/brown/cs/student/term/parsing/UrlXmlParser.java +++ b/src/main/java/edu/brown/cs/student/term/parsing/UrlXmlParser.java @@ -6,8 +6,10 @@ import org.xml.sax.SAXException; import java.io.IOException; import java.net.URL; import java.net.URLConnection; +import java.time.Instant; public class UrlXmlParser extends XmlParser{ + public UrlXmlParser() { super(); } diff --git a/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java b/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java index b1e8cb4..541add2 100644 --- a/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java +++ b/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java @@ -24,7 +24,7 @@ import java.util.List; public class LoadCommand implements Command { private Connection conn; private final static XmlParser URL_XML_PARSER = new UrlXmlParser(); - private final static XmlParser TXT_XML_PARSER = new TxtXmlParser(); + private final static TxtXmlParser TXT_XML_PARSER = new TxtXmlParser(); /** * Main run method for every command. @@ -34,7 +34,7 @@ public class LoadCommand implements Command { @Override public String run(String[] args) { // param checking - if (args.length != 1){ + if (args.length != 1 && args.length !=2) { return "ERROR: Incorrect number of arguments for load command"; } @@ -45,62 +45,117 @@ public class LoadCommand implements Command { return "ERROR: Please input an integer."; } + boolean isArchive = false; + if (args.length == 2) { + isArchive = args[1].equals("archive"); + } + System.err.println("LOG: Entered .run() of " + getClass()); - List allFilings = getFilings(numFilings); + //List filingUrls = getFilings(numFilings); + getFilings(numFilings, isArchive); + + //loadFilings(filingUrls); + + return "Finished loading " + numFilings + " filings."; + } - if (allFilings.isEmpty()) { + /** + * Parses the urls to filings and loads them into the setup DB. + * @param urls The list of urls to parsable Edgar txt files. + */ + public void loadFilings(List urls) { + if (urls.isEmpty()) { System.err.println("WARNING: No filings loaded."); + return; } conn = DatabaseQuerier.getConn(); - for(FilingFeed.Filing filing : allFilings) { + for(String url : urls) { try { System.err.println("LOG: Calling loadTransactionIntoDB() in " + getClass()); - loadTransactionIntoDB(filing.getTimestamp(), filing.getXmlUrl()); + loadTransactionIntoDB(url); } catch (SQLException throwables) { System.err.println("INTERNAL: SQLException in .run() of " + getClass()); //throwables.printStackTrace(); } } - - return "Loaded?"; } - private List getFilings(int numFilings) { - List all = new ArrayList<>(); + /** + * Makes a request to the public Edgar url and parses it's rss feed. + * @param numFilings The number of filings to parse. + */ + private void getFilings(int numFilings, boolean isArchive) { int counter = 0; + while (100*counter <= numFilings) { - String queryUrl = "https://www.sec.gov/cgi-bin/browse-edgar?" 
+ - "action=getcurrent" + - "&CIK=" + - "&type=4" + - "&company=" + - "&dateb=" + - "&owner=only" + - "&start=" + (100*counter++) + - "&count=" + 100 + - "&output=atom"; + /* + if (counter%10 == 0) { + + System.out.println("Starting wait"); + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + System.out.println("End wait"); + } + */ + + + String queryUrl = + (isArchive) ? + "https://www.sec.gov/cgi-bin/srch-edgar?" + + "text=form-type%3D4" + + "&start=" + (100*counter++) + + "&count=" + 100 + + "&first=2020" + + "&last=2021" + + "&output=atom" + : + "https://www.sec.gov/cgi-bin/browse-edgar?" + + "action=getcurrent" + + "&CIK=" + + "&type=4" + + "&company=" + + "&dateb=" + + "&owner=only" + + "&start=" + (100*counter++) + + "&count=" + 100 + + "&output=atom"; + System.out.println("LOG: Requesting filings with url: " + queryUrl); Document document = URL_XML_PARSER.parse(queryUrl); + + if (document == null) { + System.err.println("WARNING: Document was null " + queryUrl + " in getFilings(): " + getClass()); + continue; + } + FilingFeed filingFeed = new FilingFeed(document); - all.addAll(filingFeed.getFilings()); + loadFilings(filingFeed.getFilings()); + + if (counter%10 == 0) { + System.out.println("PROGRESS: " + counter*100 + "/" + numFilings); + } } // TODO: make params more adjustable - return all; } + /** * Loads a whole transaction, which can have multiple trades, into the DB. * @param url The url to the public xml file. * @throws SQLException If the prep statement fails or db doesn't exist, throws. */ - private void loadTransactionIntoDB(Instant instant, String url) throws SQLException { + private void loadTransactionIntoDB(String url) throws SQLException { System.err.println("LOG: Parsing XML into transaction in loadTransactionIntoDB(). URL: " + url); // TODO: check if this is right @julia // TODO: add parse error handling... Document document = TXT_XML_PARSER.parse(url); - if (document == null) { + long timestamp = TXT_XML_PARSER.getTimestamp(); + if (document == null || timestamp == -1) { System.err.println("WARNING: URL " + url + " failed to parse... continuing."); return; } @@ -111,7 +166,7 @@ public class LoadCommand implements Command { for(Trade trade : helper.getTrades()) { System.err.println("LOG: Loading a trade into DB -> " + trade); - loadTradeIntoDB(instant, trade); + loadTradeIntoDB(timestamp, trade); System.err.println("LOG: Loaded that trade."); } } catch (Exception e) { @@ -124,7 +179,7 @@ public class LoadCommand implements Command { * @param trade The trade to be loaded. * @throws SQLException If the prep statement fails or db doesn't exist, throws. */ - private void loadTradeIntoDB(Instant instant, Trade trade) throws SQLException { + private void loadTradeIntoDB(long timestamp, Trade trade) throws SQLException { // current table schema that is used... // TODO: make this TABLE with this SCHEMA if doesn't exist. /* @@ -149,7 +204,7 @@ public class LoadCommand implements Command { prep.setString(1, trade.getStock()); prep.setString(2, trade.getHolder().getName()); // TODO: update with timestamp @julia - prep.setLong(3, instant.toEpochMilli()); + prep.setLong(3, timestamp); prep.setInt(4, trade.isBuy() ? 
 1 : 0);
     prep.setInt(5, trade.getNumShares());
     prep.setInt(6, trade.getHolder().getId());
     prep.setDouble(7, trade.getPrice());
     System.err.println("LOG: Inserted values into prep statement.");
     prep.execute();
diff --git a/trades.sqlite3 b/trades.sqlite3
new file mode 100644
index 0000000..e69de29
-- cgit v1.2.3-70-g09d2

From 6dd133454b9c6c6d666a2dd17dd455ffd66c9937 Mon Sep 17 00:00:00 2001
From: Michael Foiani
Date: Fri, 16 Apr 2021 11:23:20 -0400
Subject: Small bug fixes.

---
 src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java | 1 +
 1 file changed, 1 insertion(+)
(limited to 'src/main/java/edu/brown')

diff --git a/src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java b/src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java
index 1be5f6f..ccb8863 100644
--- a/src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java
+++ b/src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java
@@ -10,6 +10,7 @@ import java.io.InputStreamReader;
 import java.io.StringReader;
 import java.net.URL;
 import java.net.URLConnection;
+import java.time.Instant;
 
 /**
  * Class that parses the XML contained within a publicly held txt file.
-- cgit v1.2.3-70-g09d2

From 9a8483885977d6ca17344d465e431f1f2cdafc06 Mon Sep 17 00:00:00 2001
From: Michael Foiani
Date: Sat, 17 Apr 2021 21:42:13 -0400
Subject: Created a stable load command that uses the official EDGAR RSS feed.

It has an option to load either the most recent trades or trades from a given
timeframe.
---
 data/trades.sqlite3                                 | Bin 344064 -> 16539648 bytes
 .../brown/cs/student/term/parsing/FilingFeed.java   |   2 +-
 .../cs/student/term/parsing/TxtXmlParser.java       |  19 ++++-
 .../cs/student/term/repl/commands/LoadCommand.java  |  80 +++++++++++++--------
 4 files changed, 66 insertions(+), 35 deletions(-)
(limited to 'src/main/java/edu/brown')

diff --git a/data/trades.sqlite3 b/data/trades.sqlite3
index c231e0f..878261a 100644
Binary files a/data/trades.sqlite3 and b/data/trades.sqlite3 differ
diff --git a/src/main/java/edu/brown/cs/student/term/parsing/FilingFeed.java b/src/main/java/edu/brown/cs/student/term/parsing/FilingFeed.java
index aac6358..b5a6acf 100644
--- a/src/main/java/edu/brown/cs/student/term/parsing/FilingFeed.java
+++ b/src/main/java/edu/brown/cs/student/term/parsing/FilingFeed.java
@@ -44,7 +44,7 @@ public class FilingFeed {
   private String getXmlUrl(String filingUrl) {
     String url = filingUrl.replace("-index.htm", ".txt");
     if (!url.contains("https://www.sec.gov/")) {
-      url = "https://www.sec.gov/" + url;
+      url = "https://www.sec.gov" + url;
     }
     return url;
   }
diff --git a/src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java b/src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java
index ccb8863..2e30fa7 100644
--- a/src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java
+++ b/src/main/java/edu/brown/cs/student/term/parsing/TxtXmlParser.java
@@ -10,12 +10,16 @@ import java.io.InputStreamReader;
 import java.io.StringReader;
 import java.net.URL;
 import java.net.URLConnection;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
 import java.time.Instant;
 
 /**
  * Class that parses the XML contained within a publicly held txt file.
*/ public class TxtXmlParser extends XmlParser { + public final static SimpleDateFormat TIMECONVERTER = new SimpleDateFormat("yyyyMMddHHmmss"); + private long timestamp; public TxtXmlParser() { @@ -47,10 +51,9 @@ public class TxtXmlParser extends XmlParser { while ((line = br.readLine()) != null) { // Get timestamp if (line.startsWith("")) { - String timestampString = line.replaceAll("", ""); - + String datetime = line.replaceAll("", ""); // TODO: check for errors - this.timestamp = Long.parseLong(timestampString); + this.timestamp = formatTimestamp(datetime); } // For xml @@ -77,6 +80,16 @@ public class TxtXmlParser extends XmlParser { return null; } + public long formatTimestamp(String datetime) { + long timestamp = -1; + try { + timestamp = TIMECONVERTER.parse(datetime).toInstant().toEpochMilli(); + } catch (ParseException e) { + e.printStackTrace(); + } + return timestamp; + } + /** * Returns the timestamp then resets it to -1. * @return The timestamp as a number (long). -1 if not assigned. diff --git a/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java b/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java index 541add2..00ba3ad 100644 --- a/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java +++ b/src/main/java/edu/brown/cs/student/term/repl/commands/LoadCommand.java @@ -34,32 +34,59 @@ public class LoadCommand implements Command { @Override public String run(String[] args) { // param checking - if (args.length != 1 && args.length !=2) { + if (args.length != 1 && args.length !=2 && args.length !=3) { return "ERROR: Incorrect number of arguments for load command"; } int numFilings; try { numFilings = Integer.parseInt(args[0]); + if (numFilings <=0) { + return "ERROR: Please input an positive integer for number of filings."; + } } catch (NumberFormatException e) { - return "ERROR: Please input an integer."; + return "ERROR: Please input an integer for number of filings."; } - boolean isArchive = false; - if (args.length == 2) { - isArchive = args[1].equals("archive"); + int shift = 0; + try { + if (args.length == 2) { + shift = Integer.parseInt(args[1]); + if (shift <=0) { + return "ERROR: Please input an positive integer for the count shift."; + } + } + } catch (NumberFormatException e) { + return "ERROR: Please input an integer for the shift."; + } + + String filingDate = null; + if (args.length == 3) { + filingDate = args[2]; + System.out.println("WARNING: The archive version of the command make take " + + "a long time if a broad query param is inputted."); } System.err.println("LOG: Entered .run() of " + getClass()); //List filingUrls = getFilings(numFilings); - getFilings(numFilings, isArchive); + getFilings(numFilings, shift, filingDate); //loadFilings(filingUrls); return "Finished loading " + numFilings + " filings."; } + private void timeout() { + // System.out.println("timeout 100 mil"); + try { + Thread.sleep(100); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + + /** * Parses the urls to filings and loads them into the setup DB. * @param urls The list of urls to parsable Edgar txt files. @@ -86,29 +113,17 @@ public class LoadCommand implements Command { * Makes a request to the public Edgar url and parses it's rss feed. * @param numFilings The number of filings to parse. 
*/ - private void getFilings(int numFilings, boolean isArchive) { + private void getFilings(int numFilings, int shift, String filingDate) { int counter = 0; - while (100*counter <= numFilings) { - /* - if (counter%10 == 0) { - - System.out.println("Starting wait"); - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - e.printStackTrace(); - } - System.out.println("End wait"); - } - */ - + while (100*counter <= (numFilings - shift)) { + timeout(); String queryUrl = - (isArchive) ? + (filingDate != null) ? "https://www.sec.gov/cgi-bin/srch-edgar?" + - "text=form-type%3D4" + - "&start=" + (100*counter++) + + "text=form-type%3D4+and+(filing-date%3D" + filingDate + ")" + + "&start=" + (100*counter++ + shift) + "&count=" + 100 + "&first=2020" + "&last=2021" + @@ -121,13 +136,12 @@ public class LoadCommand implements Command { "&company=" + "&dateb=" + "&owner=only" + - "&start=" + (100*counter++) + + "&start=" + (100*counter++ + shift) + "&count=" + 100 + "&output=atom"; - System.out.println("LOG: Requesting filings with url: " + queryUrl); + System.err.println("LOG: Requesting filings with url: " + queryUrl); Document document = URL_XML_PARSER.parse(queryUrl); - if (document == null) { System.err.println("WARNING: Document was null " + queryUrl + " in getFilings(): " + getClass()); continue; @@ -153,6 +167,8 @@ public class LoadCommand implements Command { System.err.println("LOG: Parsing XML into transaction in loadTransactionIntoDB(). URL: " + url); // TODO: check if this is right @julia // TODO: add parse error handling... + // timeout to reduce the too many requests + timeout(); Document document = TXT_XML_PARSER.parse(url); long timestamp = TXT_XML_PARSER.getTimestamp(); if (document == null || timestamp == -1) { @@ -166,7 +182,7 @@ public class LoadCommand implements Command { for(Trade trade : helper.getTrades()) { System.err.println("LOG: Loading a trade into DB -> " + trade); - loadTradeIntoDB(timestamp, trade); + loadTradeIntoDB(timestamp, trade, url); System.err.println("LOG: Loaded that trade."); } } catch (Exception e) { @@ -179,7 +195,7 @@ public class LoadCommand implements Command { * @param trade The trade to be loaded. * @throws SQLException If the prep statement fails or db doesn't exist, throws. */ - private void loadTradeIntoDB(long timestamp, Trade trade) throws SQLException { + private void loadTradeIntoDB(long timestamp, Trade trade, String url) throws SQLException { // current table schema that is used... // TODO: make this TABLE with this SCHEMA if doesn't exist. /* @@ -192,14 +208,15 @@ public class LoadCommand implements Command { number_of_shares INTEGER, holder_id INTEGER, share_price NUMERIC, + filing_url TEXT UNIQUE (trade_timestamp, is_buy, number_of_shares, holder_id, share_price)); */ System.err.println("LOG: Setting prepared statement on " + conn); PreparedStatement prep = conn.prepareStatement( "INSERT INTO trades (stock_name, holder_name, trade_timestamp, is_buy, " + - "number_of_shares, holder_id, share_price) " + - "VALUES (?, ?, ?, ?, ?, ?, ?)"); + "number_of_shares, holder_id, share_price, filing_url) " + + "VALUES (?, ?, ?, ?, ?, ?, ?, ?)"); prep.setString(1, trade.getStock()); prep.setString(2, trade.getHolder().getName()); @@ -209,6 +226,7 @@ public class LoadCommand implements Command { prep.setInt(5, trade.getNumShares()); prep.setInt(6, trade.getHolder().getId()); prep.setDouble(7, trade.getPrice()); + prep.setString(8, url); System.err.println("LOG: Inserted values into prep statement."); prep.execute(); -- cgit v1.2.3-70-g09d2
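
A closing note on the database side: loadTradeIntoDB() still assumes the
trades table already exists (its TODO about creating the table is still open).
Below is a sketch of a CREATE TABLE statement that matches the schema comment
and the eight-column INSERT above; column types not visible in the hunks (the
name, url, timestamp, and flag columns) are assumed, and the helper name is
illustrative:

    import java.sql.Connection;
    import java.sql.SQLException;
    import java.sql.Statement;

    final class TradesSchemaSketch {
      /** Creates the trades table if it is missing, so a fresh DB can be loaded into. */
      static void ensureTradesTable(Connection conn) throws SQLException {
        String ddl = "CREATE TABLE IF NOT EXISTS trades ("
            + "stock_name TEXT, "
            + "holder_name TEXT, "
            + "trade_timestamp INTEGER, "
            + "is_buy INTEGER, "
            + "number_of_shares INTEGER, "
            + "holder_id INTEGER, "
            + "share_price NUMERIC, "
            + "filing_url TEXT, "
            + "UNIQUE (trade_timestamp, is_buy, number_of_shares, holder_id, share_price))";
        try (Statement stmt = conn.createStatement()) {
          stmt.executeUpdate(ddl);
        }
      }
    }

With the UNIQUE constraint in place, re-running the load command over
overlapping filings should make the duplicate INSERTs fail rather than
double-count trades, which lines up with how the command already catches and
logs SQLExceptions per filing.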