Skip to content

Latest commit

 

History

History
61 lines (49 loc) · 1.77 KB

parser.md

File metadata and controls

61 lines (49 loc) · 1.77 KB

Parser

I used this to parse a large database dump and extract the relevant information.

  • Extracts the data matching the pattern <url> data </url> and saves it to a file.
from re import compile, findall

def parse(fileName, outputName='output'):
    """Extract the text between ``<url>`` and ``</url>`` tags and save it.

    The input is read whole and lower-cased before matching, so the
    extracted values are lower-case as well. Every match is written on
    its own line. An opening and closing tag must sit on the same line,
    because ``.`` in the pattern does not match newlines.

    Args:
        fileName: Path of the UTF-8 text file to scan.
        outputName: Path of the file to (over)write with the matches.
            Defaults to ``'output'`` in the current directory.
    """
    exp = compile(r"<url>(.*?)</url>")
    # Context manager so the input handle is closed even if reading fails.
    with open(fileName, 'r', encoding="utf-8") as infile:
        text = infile.read().lower()  # Notice, no .split()
    text_exclusive = ''.join(block + "\n" for block in findall(exp, text))
    with open(outputName, 'w', encoding='utf-8') as f:
        f.write(text_exclusive)

# Example run: scan the file named "testData" for <url>...</url> entries.
parse("testData")
  • Converts a list of URLs into image links in a markdown file for visualization.
def format(pathName, outputName='output'):
    """Turn a file of URLs (one per line) into markdown image links.

    Each input line is stripped of surrounding whitespace and written as
    ``![](<line>)`` on its own line.

    Args:
        pathName: Path of the text file holding one URL per line.
        outputName: Path of the file to (over)write with the markdown.
            Defaults to ``'output'`` in the current directory.
    """
    # Read as UTF-8 to match the encoding used for the output file, and
    # use a context manager so the input handle is always closed.
    with open(pathName, 'r', encoding='utf-8') as source:
        output = ''.join("![]({})\n".format(line.strip()) for line in source)
    with open(outputName, 'w', encoding='utf-8') as f:
        f.write(output)

# Example run: convert the URLs listed in "Data.txt" into markdown image links.
format("Data.txt")
  • Builds a tree from a folder of markdown files to use as a summary.
import os
from os.path import basename

def list_files(startpath):
    """Print a markdown outline of the files found under *startpath*.

    For every directory that contains files, a ``- #### <directory>``
    heading is printed, followed by one indented link per file of the
    form ``- [<name>](./<directory>/<file>)``. Spaces in the link target
    are replaced with ``%20``. The link label is the file name without
    its last three characters (the ``.md`` suffix of a markdown file).

    Directories whose path relative to *startpath* starts with ``.git``
    and directories whose own name starts with ``Screen`` are left out;
    their sub-directories are still visited.

    Args:
        startpath: Root directory to walk.
    """
    currentBasePath = ""
    for path, _subdirs, files in os.walk(startpath):
        # relpath copes with a trailing separator on startpath and with
        # non-POSIX separators, unlike stripping the prefix by hand.
        if os.path.relpath(path, startpath).startswith(".git"):
            continue
        basePath = os.path.basename(path)
        if basePath.startswith("Screen"):
            continue
        pathName = basePath.replace(" ", "%20")
        for name in files:
            # Emit the heading lazily so directories without files stay silent.
            if currentBasePath != basePath:
                currentBasePath = basePath
                print('- #### {}'.format(basePath))
            fileName = name.replace(" ", "%20")
            print('{}- [{}](./{}/{})'.format("    ", name[:-3], pathName, fileName))

# Example run: print the outline for the "Notebook" directory.
list_files("Notebook")