Farewell to Disks: Efficient Processing of Obstinate Data

467 views

Published on

Professor Diomidis Spinellis gave a lecture titled "Farewell to Disks: Efficient Processing of Obstinate Data" in the Distinguished Lecturer Series - Leon The Mathematician.

Published in: Education, Technology, Business
0 Comments
0 Likes
Statistics
Notes
  • Be the first to comment

  • Be the first to like this

No Downloads
Views
Total views
467
On SlideShare
0
From Embeds
0
Number of Embeds
3
Actions
Shares
0
Downloads
1
Comments
0
Likes
0
Embeds 0
No embeds

No notes for slide

Farewell to Disks: Efficient Processing of Obstinate Data

  1. 1. Αποχαιρετισμός στους Δίσκους: Αποδοτική Επεξεργασία Περίπλοκων Δεδομένων, Διομήδης Σπινέλλης, Καθηγητής, Τμήμα Διοικητικής Επιστήμης και Τεχνολογίας, Οικονομικό Πανεπιστήμιο Αθηνών, http://www.dmst.aueb.gr/dds 1 1
  2. 2. 2
  3. 3. 3
  4. 4. 4
  5. 5. 5
  6. 6. 1 6
  7. 7. 7
  8. 8. 8
  9. 9. 9
  10. 10. 2 10
  11. 11. 11
  12. 12. 3 12
  13. 13. 13
  14. 14. Worst case latency (Log scale) L1 D cache L2 cache DDR RAM Hard disk 1.3 ns 9.7 ns 28.5 ns 25.6 ms 14
  15. 15. 15
  16. 16. 16
  17. 17. 17
  18. 18. 1 18
  19. 19. Time (Log scale) Function call System call Local IPC Remote IPC 1.3ns 1.9μs 4.3μs 1.2ms select Locations.cc1, Divisions.name, avg(CO2), count(*), Locations.lat, Locations.long, POPDENSITY.DENSITY from Papers inner join Locations on Papers.confLocId = Locations.id inner join Divisions on Locations.cc1 = Divisions.country inner join POPDENSITY on Divisions.name = upper(POPDENSITY.name) where Divisions.code = 00 and CO2 notnull group by Locations.cc1 having count(*) > 20 order by avg(CO2) desc; 19
  20. 20. /* Get the data */if (mcSet.dataLen) { data = xmalloc(mcSet.dataLen); if (lseek(fd, mcSet.data.off, SEEK_SET) == -1) CORRUPT(); if (read(fd, data, mcSet.dataLen) != mcSet.dataLen) CORRUPT(); if (lseek(fd, mcSet.u.firstMsg, SEEK_SET) == -1) CORRUPT(); for (i = 0; i < mcSet.numMsgs; ++i) { if (read(fd, &mcMsg, sizeof(mcMsg)) != sizeof(mcMsg)) CORRUPT(); if (mcMsg.invalid) { --i; continue; } msg = xmalloc(sizeof(msgT)); memset(msg, 0, sizeof(*msg)); /* […] */ msg->msgId = mcMsg.msgId; msg->str = xstrdup((char *) (data + mcMsg.msg.off)); } free(data);} 2 20
  21. 21. MMAP(2) FreeBSD System Calls Manual MMAP(2) NAME mmap -- allocate memory, or map files or devices into memory SYNOPSIS #include <sys/mman.h> void * mmap(void *addr, size_t len, int prot, int flags, int fd, off_t offset); DESCRIPTION The mmap() system call causes the pages starting at addr and continuing for at most len bytes to be mapped from the object described by fd, starting at byte offset offset. 21
  22. 22. [dds@istlab /usr/src/sys/vm]$ ls default_pager.c uma_int.h vm_page.c device_pager.c vm.h vm_page.h memguard.c vm_contig.c vm_pageout.c memguard.h vm_extern.h vm_pageout.h phys_pager.c vm_fault.c vm_pager.c pmap.h vm_glue.c vm_pager.h redzone.c vm_init.c vm_param.h redzone.h vm_kern.c vm_phys.c sg_pager.c vm_kern.h vm_phys.h swap_pager.c vm_map.c vm_reserv.c swap_pager.h vm_map.h vm_reserv.h uma.h vm_meter.c vm_unix.c uma_core.c vm_mmap.c vm_zeroidle.c uma_dbg.c vm_object.c vnode_pager.c uma_dbg.h vm_object.h vnode_pager.h 22
  23. 23. 3 23
  24. 24. $ ls -lh sparse -rw-r--r-- 1 dds dds 500G Mar 19 20:32 sparse $ du -h sparse 28K sparse 4 διεργασία 1 διεργασία 2 r/o r/o φυσική μνήμη 24
  25. 25. διεργασία 1 διεργασία 2 r/w r/w φυσική μνήμη διεργασία 1 διεργασία 2 read read φυσική μνήμη διεργασία 1 διεργασία 2 read r/w αντίγραφο φυσική μνήμη 25
  26. 26. 5C++ 26
  27. 27. e.g. 1 CC-BY 2.5 Claudio Rocchini 27
  28. 28. 01110010011 0111101101101011 0000101101110011 00101 // romane01110010011 0111101101101011 0000101101110011 1010101110011 // romanus01110010011 0111101101101011 10101011011000111010101110011 // romulus01110010011 10101011000100110 0101011 0111001110011 // rubens01110010011 10101011000100110 0101011 10010 // ruber01110010011 10101011000100110 100101100011011 0111101101110 // rubicon01110010011 10101011000100110 100101100011011 1010101101110011001000111 010101110011 // rubicundus Δομή Δομή δίσκου Ιστοσελίδα μνήμης με νέους δεσμούς Κατασκευή Κατασκευή δένδρου δομής wikipedialize ριζών δίσκου Κατάλογος Αρχική άρθρων ιστοσελίδα for (;;) { i = bitpos; // Loop until the end of the current node or the end of the word while (i < p->end && i < len * 8) { // Covering whole byte? if (i % 8 == 0 && i + 8 <= p->end && (i + 8) / 8 <= len && data[i / 8] == p->data[i / 8]) { i += 8; continue; } // Split point if (getbit(data, i) != getbit(p->data, i)) { // Node with the new data struct pnode *n = new_node(data + i / 8, i % 8, (len - i / 8) * 8, NULL, NULL, true); // Tail of the current node struct pnode *t = new_node(p->data + i / 8, i % 8, p->end - (i & ~7), p->zero, p->one, p->is_terminal); // Head of current node if (getbit(data, i)) *p2 = new_node(p->data, bitpos, i, t, n, false); else *p2 = new_node(p->data, bitpos, i, n, t, false); free(p); return; } i++; } // while 28
  29. 29. // Write the given node to the specified file, returning its file offset.// On return the files offset is set to the first free byte.static longwrite_node(struct pnode *p, FILE *f){ long my_offset = ftell(f); size_t ret; if (p->one) { struct pnode_disk_one pdo; size_t dlen = datalen(p->end); long len = sizeof(pdo) + dlen; fseek(f, len, SEEK_CUR); pdo.h.type = dt_one; pdo.h.is_terminal = p->is_terminal; pdo.h.has_zero = (p->zero != NULL); pdo.h.has_one = true; pdo.h.begin = p->begin; pdo.h.end = p->end; if (p->zero) write_node(p->zero, f); pdo.one = write_node(p->one, f); long saved_offset = ftell(f); fseek(f, my_offset, SEEK_SET); fwrite(&pdo, 1, sizeof(pdo), f); fwrite(p->data, 1, dlen, f); fseek(f, saved_offset, SEEK_SET); return my_offset; } else {$ zcat enwiki-latest-all-titles-in-ns0.gz | wc -c106,237,053$ wc -c enwiki.pt144,657,286 enwiki.pt 29
  30. 30. $ curl http://www.kiosek.com/dostoevsky/library/crimeandpunishment.txt | perl -pe 's/[\r\n]/ /g' >crimeandpunishment.txt 30
  31. 31. $ wc crimeandpunishment.txt 0 203,273 1,462,661 crimeandpunishment.txt $ time ./wpltest en en_US.UTF-8 ISO-8859-1 data/enwiki.pt <crimeandpunishment.txt >/dev/null $ time ./wpltest en en_US.UTF-8 ISO-8859-1 data/enwiki.pt <crimeandpunishment.txt >/dev/null Checked 406,225 prefixes real 0m5.859s # Cold cache real 0m1.876s # Warm cache user 0m1.780s sys 0m0.090s 31
  32. 32. // Prevent memory alignment problemsmemcpy(&end, &(p.h->end), sizeof(end)); /*while (i < end && i < len * 8) { if (i % 8 == 0 && i + 8 <= end && data[i / 8] == pdata[i / 8]) { */ * to understand this * You are not expected i += 8; prefix += 8; continue; } // Split point if (getbit(data, i) != getbit(pdata, i)) return best / 8; i++; prefix++;}if (i == end && p.h->is_terminal) best = prefix;if (i == len * 8) return best / 8;// Move to next nodebitpos = end % 8;int covered = end / 8;if (getbit(data, end)) { if (!p.h->has_one) return best / 8; switch (p.h->type) { case dt_both: p.h = (struct pnode_disk_head *)(base + p.b->one); break; case dt_one: p.h = (struct pnode_disk_head *)(base + p.o->one); break; case dt_short: default: assert(0); }} else { if (!p.h->has_zero) return best / 8; switch (p.h->type) { case dt_both: p.h = (struct pnode_disk_head *)(base + p.b->zero); break; case dt_one: // Advance to the end of this node p.h = (struct pnode_disk_head *)((char *)p.h + sizeof(struct pnode_disk_one) + datalen(end)); break; case dt_short: // Advance to the end of this node p.h = (struct pnode_disk_head *)((char *)p.h + sizeof(struct pnode_disk_short) + datalen(end)); break; default: assert(0); }} 32
  33. 33. e.g. 2The problem with wikipedia 33
  34. 34. Κατασκευή γράφου ΔομήΛίστα ακμών δεδομένων γράφου Κορυφές Διαδρομή αρχής, τέλους BFS Δομή δεδομένων γράφου Κορυφές Διαδρομή αρχής, τέλουςΚατασκευή BFS γράφου ΔομήΛίστα ακμών δεδομένων γράφου 34
  35. 35. Κατασκευή γράφου Δομή Λίστα ακμών δεδομένων γράφου// Loop through all lines,// adding them to the graph while (std::getline(in, line)) { int split = line.find(001); if (split == std::string::npos) { std::cerr << "No separator: " << line << std::endl; continue; } n.setName(line.substr(0, split)); NodesIter from(entries->insert(n).first); n.setName(line.substr(split + 1)); NodesIter to(entries->insert(n).first); (const_cast<Node &>(*from)).addEdge( const_cast<Node *>(&*to)); } Κορυφές Διαδρομή αρχής, τέλους Κατασκευή BFS γράφου Δομή Λίστα ακμών δεδομένων γράφου 35
  36. 36. Διαδρομή BFS Δομή δεδομένων γράφου Tacoma Narrows Bridge p= Suspension bridge Washingtonp= p= Geneva Montanap= p= Ουρά []=William Howard Taftp= Tacoma Narrows Bridge p= Suspension bridge Washingtonp= p= Geneva Montanap= p= Ουρά []=Tacoma Narrow BridgeWilliam Howard Taftp= 36
  37. 37. Tacoma Narrows Bridge p= Suspension bridge Washingtonp= p= Geneva Montanap= p= Ουρά []=Tacoma Narrow BridgeWilliam Howard Taftp= Tacoma Narrows Bridge p= Suspension bridge Washingtonp= p= Geneva Montanap= p= Ουρά []=William Howard Taftp= Tacoma Narrows Bridge p= Suspension bridge Washingtonp=Tacoma Narrows Bridge p= Geneva Montanap= p= Ουρά []= Suspension bridgeWilliam Howard Taftp= 37
  38. 38. Tacoma Narrows Bridge p= Suspension bridge Washingtonp=Tacoma Narrows Bridge p=Tacoma Narrows Bridge Geneva Montanap= p= Ουρά []= Suspension bridgeWilliam Howard Taft Washingtonp= Tacoma Narrows Bridge p= Suspension bridge Washingtonp=Tacoma Narrows Bridge p=Tacoma Narrows Bridge Geneva Montanap= p= Ουρά []= Suspension bridgeWilliam Howard Taft Washingtonp= Tacoma Narrows Bridge p= Suspension bridge Washingtonp=Tacoma Narrows Bridge p=Tacoma Narrows Bridge Geneva Montanap=Suspension bridge p= Ουρά []= WashingtonWilliam Howard Taft Genevap= 38
  39. 39. Tacoma Narrows Bridge p= Suspension bridge Washingtonp=Tacoma Narrows Bridge p=Tacoma Narrows Bridge Geneva Montanap=Suspension bridge p=Washington Ουρά []= GenevaWilliam Howard Taft Montanap= Tacoma Narrows Bridge p= Suspension bridge Washingtonp=Tacoma Narrows Bridge p=Tacoma Narrows Bridge Geneva Montanap=Suspension bridge p=Washington Ουρά []=MontanaWilliam Howard Taftp= Tacoma Narrows Bridge p= Suspension bridge Washingtonp=Tacoma Narrows Bridge p=Tacoma Narrows Bridge Geneva Montanap=Suspension bridge p=Washington Ουρά []=MontanaWilliam Howard Taftp=Geneva 39
  40. 40. static bool breadthFirstSearchFor(NodePtr from, NodePtr to, size_t n) { std::queue<NodePtr> q; from->setColor(Node::Gray); q.push(from); while (!q.empty()) { NodePtr u = q.front(); q.pop(); const Edges edges = u->getEdges(); for (Edges::const_iterator j = edges.begin(); j != edges.end(); j++) if ((*j)->getColor() == Node::White) { (*j)->setColor(Node::Gray); (*j)->setPredecessor(u); if (*j == to) return true; // Found q.push(*j); } u->setColor(Node::Black); } return false; // Not found} Κορυφές Διαδρομή αρχής, τέλους Κατασκευή BFS γράφου Δομή Λίστα ακμών δεδομένων γράφου Δομή δεδομένων γράφου 40
  41. 41. 41
  42. 42. Δομή δεδομένων γράφου#include <string>#include <iostream>#include <queue>#include <list>#include <functional>#include <boost/interprocess/managed_mapped_file.hpp>#include <boost/interprocess/offset_ptr.hpp>#include <boost/interprocess/allocators/allocator.hpp>#include <boost/unordered_set.hpp>#include <boost/interprocess/containers/string.hpp>#include <boost/interprocess/containers/slist.hpp>#include <boost/filesystem.hpp>#include <boost/filesystem/operations.hpp> 42
  43. 43. #include <string>#include <iostream>#include <queue>#include <list>#include <functional>#include <boost/interprocess/managed_mapped_file.hpp>#include <boost/interprocess/offset_ptr.hpp>#include <boost/interprocess/allocators/allocator.hpp>#include <boost/unordered_set.hpp>#include <boost/interprocess/containers/string.hpp>#include <boost/interprocess/containers/slist.hpp>#include <boost/filesystem.hpp>#include <boost/filesystem/operations.hpp>typedef managed_mapped_file::segment_manager SegmentManager;typedef allocator<char, SegmentManager> CharAllocator;typedef basic_string<char, std::char_traits<char>, CharAllocator> CharString;typedef allocator<Node, SegmentManager> NodeAllocator;typedef boost::unordered_set<Node, boost::hash<Node>, NodeEqual, NodeAllocator> Nodes;typedef offset_ptr<Node> NodePtr;typedef allocator<NodePtr, SegmentManager> NodePtrAllocator;typedef slist<NodePtr, NodePtrAllocator> Edges;typedef allocator<void, SegmentManager> VoidAllocator;typedef allocator<Edges, SegmentManager> EdgesAllocator;// A graph node, suitable for performing a breadh-first searchclass Node { public: typedef enum {White, Gray, Black} Color; private: CharString name; // Node name Color color; // Color used during BFS NodePtr predecessor; // BFS predecessor Edges edges; // Nodes edges public: // Since VoidAllocator is convertible to any other // allocator<T>, we can simplify the initialization // taking just one allocator for all inner containers. Node(const std::string &n, const VoidAllocator &voidAlloc) : name(n.begin(), n.end(), voidAlloc), color(White), predecessor(NULL), edges(voidAlloc) {} void addEdge(NodePtr p) { edges.push_front(p); }}; 43
  44. 44. /* * Read ^A-separated nodes from the inputFile, storing the graph * structure in the specified backingFile. */static void readData(const char *backingFile, const char *inputFile) { std::ifstream in(inputFile, std::ios::binary); if (in.fail()) { perror(inputFile); exit(1); } boost::filesystem::remove_all(backingFile); managed_mapped_file segment(create_only, backingFile, FileSize); // An allocator convertible to any allocator<T, SegmentManager> type VoidAllocator allocInst (segment.get_segment_manager()); // Construct the memory map and fill it Nodes *entries = segment.construct<Nodes>("entries")(Elements, boost::hash<Node>(), NodeEqual(), allocInst); std::string line; Node n(std::string(), allocInst); // To save construction costs/* * Search and report the shortest graph path from "from" to "to" * The graph is stored in backingFile. */static void searchData(const char *backingFile, const std::string &from, const std::string &to) { managed_mapped_file segment(open_copy_on_write, backingFile); // An allocator convertible to any allocator<T, SegmentManager> VoidAllocator allocInst(segment.get_segment_manager()); // Obtain the previously saved entries Nodes *entries = segment.find<Nodes>("entries").first; NodePtr toPtr; bool found = breadthFirstSearchFor( findNode(entries, Node(from, allocInst)), toPtr = findNode(entries, Node(to, allocInst)), entries->size()); 44
  45. 45. Κορυφές Διαδρομή αρχής, τέλουςΚατασκευή BFS γράφου ΔομήΛίστα ακμών δεδομένων γράφου 45
  46. 46. $ ./smap -r graph.bin graph.txt$ ./smap -s graph.bin Tacoma Narrows BridgeWilliam howard taft0% 10 20 30 40 50 60 70 80 90 100%|----|----|----|----|----|----|----|----|----|----|*Tacoma Narrows BridgeWashingtonMontanaWilliam howard taft$ ./smap -s graph.bin Tacoma Narrows Bridge24-hour analog dial0% 10 20 30 40 50 60 70 80 90 100%|----|----|----|----|----|----|----|----|----|----|**Tacoma Narrows BridgeSuspension bridgeGenevaWatch24-hour analog dial 46
  47. 47. $ ./smap -s graph.bin Tacoma Narrows Bridge Wet t-shirtcontest0% 10 20 30 40 50 60 70 80 90 100%|----|----|----|----|----|----|----|----|----|----|*Tacoma Narrows BridgeWashingtonStarbucksToplessnessWet t-shirt contest The problem with wikipedia 47
  48. 48. Performance Κατασκευή δομής δεδομένων 18:00Χρόνος (ω:λ) 12:00 06:00 00:00 MySQL mmap Server 15:59:43 Client system 03:16:59 00:04:32 Client user 00:52:48 00:04:52 Taft: Κρύα κρυφή μνήμη 4.500Χρόνος / κόμβο (μs) 4.000 3.500 3.000 2.500 2.000 1.500 1.000 500 0 MySQL mmap Waiting 348 3.886 Server 259 Client system 58 19 Client user 16 2 48
  49. 49. Taft: Ζεστή κρυφή μνήμη 450Χρόνος / κόμβο (μs) 400 350 300 250 200 150 100 50 0 MySQL mmap Waiting 23 0 Server 305 Client system 59 5 Client user 15 3 24h Clock: Κρύα κρυφή μνήμη 2.500Χρόνος / κόμβο (μs) 2.000 1.500 1.000 500 0 MySQL mmap Waiting 415 1.977 Server 472 Client system 103 10 Client user 26 4 24h Clock: Ζεστή κρυφή μνήμη 800Χρόνος / κόμβο (μs) 700 600 500 400 300 200 100 0 MySQL mmap Waiting 120 0 Server 469 Client system 103 3 Client user 27 4 49
  50. 50. Κλιμάκωση απόδοσης (κρύα μνήμη)Χρόνος (ρ) / κόμβο (ms) 5 mmap MySQL 4 3 2 1 0 0 2000 4000 6000 8000 Χιλιάδες Αριθμός κόμβων 50
  51. 51. ACIDA 51
  52. 52. CID 52
  53. 53. SQL 53
  54. 54. A case… Application code vector<Customer> customers1; Customer c1(d1,cd1,s1,p1); customers1.push_back(c1); … vector<Truck> trucks; Truck t1(cs1,dc1,pc1,rlp1, customers1); trucks.push_back(t1); ….ODBCJDBC 54
  55. 55. register L1 D cache L2 cache L3 cache DRAM HDD cache HDD / SSD 55
  56. 56. 534,681,000 εντολές ΚΜΕ 100,000Μέγιστη διεκπεραιωτικότητα (MB/s ) 10,000 1,000 100 10 1 L1 D cache L2 cache DDR RAM Hard diskΧείριστη αναμονή (λογ. κλιμ.) L1 D cache L2 cache DDR RAM Hard disk 1.3 ns 9.7 ns 28.5 ns 25.6 ms 56
  57. 57. Χείριστη αναμονή (λογ. κλιμ.) L1 D cache L2 cache DDR RAM Hard disk 1.3 ns 9.7 ns 28.5 ns 25.6 ms 57
  58. 58. // Write the given node to the specified file, returning its file offset.// On return the files offset is set to the first free byte.static longwrite_node(struct pnode *p, FILE *f){ long my_offset = ftell(f); size_t ret; if (p->one) { struct pnode_disk_one pdo; size_t dlen = datalen(p->end); long len = sizeof(pdo) + dlen; fseek(f, len, SEEK_CUR); pdo.h.type = dt_one; pdo.h.is_terminal = p->is_terminal; pdo.h.has_zero = (p->zero != NULL); pdo.h.has_one = true; pdo.h.begin = p->begin; pdo.h.end = p->end; if (p->zero) write_node(p->zero, f); pdo.one = write_node(p->one, f); long saved_offset = ftell(f); fseek(f, my_offset, SEEK_SET); fwrite(&pdo, 1, sizeof(pdo), f); fwrite(p->data, 1, dlen, f); fseek(f, saved_offset, SEEK_SET); return my_offset; } else { 58
  59. 59. #include <boost/interprocess/managed_mapped_file.hpp>#include <boost/interprocess/offset_ptr.hpp>#include <boost/interprocess/allocators/allocator.hpp>#include <boost/unordered_set.hpp>#include <boost/interprocess/containers/string.hpp>#include <boost/interprocess/containers/slist.hpp> βήμα 1 βήμα Ν w r/ο φυσική μνήμη διεργασία 1 διεργασία 2 read r/w αντίγραφο φυσική μνήμη 59
  60. 60. www.spinellis.gr twitter.com/CoolSWEng dds@aueb.gr 60
  61. 61. www.spinellis.gr/wpl www.spinellis.gr/blog/20101030/smap.cpp 61

×