trinker / qdapRegex

qdapRegex is a collection of regular expression tools associated with the qdap package that may be useful outside of the context of discourse analysis.
49 stars 4 forks source link

add state abbreviations #23

Open trinker opened 8 years ago

trinker commented 8 years ago
#https://gist.github.com/nerdsrescueme/1237767
# Two-letter postal codes for the 50 states plus DC, delimited by word
# boundaries. Territory codes (AS, GU, MP, PR, VI, UM) are not part of the
# alternation.
# The group is matched exactly once: a trailing `*` quantifier would let the
# pattern succeed with zero repetitions (an empty match at every word
# boundary) and would also accept glued runs such as "AKAL".
pat <- '\\b(?:A[KLRZ]|C[AOT]|D[CE]|FL|GA|HI|I[ADLN]|K[SY]|LA|M[ADEINOST]|N[CDEHJMVY]|O[HKR]|PA|RI|S[CD]|T[NX]|UT|V[AT]|W[AIVY])\\b'
# Build the extractor from the pattern.
# NOTE(review): `ex_` is not defined in this snippet; presumably it is
# qdapRegex's extractor factory -- confirm the package is attached.
ex_states <- ex_(pattern = pat)
# Sample input: a table of two-letter postal codes and names covering the
# states, the federal district (DC) and the territories, including the
# section headings that came with the pasted page.
txt <- "In alphabetical order by postal abbreviation (not by state):

State Abbreviation  State Name
AK  Alaska
AL  Alabama
AR  Arkansas
AZ  Arizona
CA  California
CO  Colorado
CT  Connecticut
DE  Delaware
FL  Florida
GA  Georgia
HI  Hawaii
IA  Iowa
ID  Idaho
IL  Illinois
IN  Indiana
KS  Kansas
KY  Kentucky
LA  Louisiana
MA  Massachusetts
MD  Maryland
ME  Maine
MI  Michigan
MN  Minnesota
MO  Missouri
MS  Mississippi
MT  Montana
NC  North Carolina
ND  North Dakota
NE  Nebraska
NH  New Hampshire
NJ  New Jersey
NM  New Mexico
NV  Nevada
NY  New York
OH  Ohio
OK  Oklahoma
OR  Oregon
PA  Pennsylvania
RI  Rhode Island
SC  South Carolina
SD  South Dakota
TN  Tennessee
TX  Texas
UT  Utah
VA  Virginia
VT  Vermont
WA  Washington
WI  Wisconsin
WV  West Virginia
WY  Wyoming
Other codes[change | change source]
Federal districts[change | change source]
Abbreviation    Location
DC  District of Columbia
Territories[change | change source]
Abbreviation    Location
AS  American Samoa
GU  Guam
MP  Northern Mariana Islands
PR  Puerto Rico
VI  U.S. Virgin Islands
UM  United States Minor Outlying Islands
"

# Insert a space wherever two capital letters are immediately followed by a
# capital-plus-lowercase pair, e.g. "AKAlaska" becomes "AK Alaska".
# NOTE(review): presumably this repairs pastes in which the column separator
# between code and name was lost -- confirm against the real input.
txt <- gsub("([A-Z]{2})([A-Z][a-z])", "\\1 \\2", txt)
# Run the extractor built from `pat` earlier in this snippet over the sample.
ex_states(txt)
trinker commented 8 years ago
#http://www.regexlib.com/UserPatterns.aspx?authorId=dd782f8f-82a2-45e6-8e77-00a9647e1aac
#jared cook
# Scientific-notation pattern: an optionally negative significand whose
# leading digit is 1-9, followed by either "x 10^n" / "* 10En" style or the
# compact "En" style exponent. Group 1 is the significand; group 5 or
# group 7 holds the exponent, depending on which form matched.
# The literal only spells upper-case `X` and `E`, so the inline `(?i)` flag
# makes the whole pattern case-insensitive; without it lower-case forms such
# as "1.1 x 10^9" or "3.14159e30" are missed. The flag is not a group, so the
# capture-group numbering is unchanged.
pat <- '(?i)(-?[1-9](\\.\\d+)?)((\\s?[X*]\\s?10[E^]([+-]?\\d+))|(E([+-]?\\d+)))'
# Build the extractor from the pattern.
# NOTE(review): `ex_` is not defined in this snippet; presumably it is
# qdapRegex's extractor factory -- confirm the package is attached.
ex_scientific_notation <- ex_(pattern = pat)

# Sample input: a fragment of descriptive prose followed by a "Matches" list
# of "|"-separated scientific-notation strings and a "Non-Matches" entry.
# NOTE(review): presumably pasted from the regexlib page cited for the
# pattern -- confirm.
txt <- "negative numbers. Make sure to use a case insensitive pattern matcher. Group 1 is the significand, and either group 5 or group 7 is the exponent (one or the other will be null, but never both).
Matches 
1.1 x 10^9 | 2.34 X 10^12 | 3.14159 * 10^30 | 1.1x10^9 | 2.34X10^12 | 3.14159*10^30 | 1.1 x 10e9 | 2.34 x 10E12 | 3.14159e30 | 1.1 x 10^-9 | 2.34 X 10^-12 | 3.14159E-30 | -1.1 x 10^9 | -2.34 X 10E12 | -3.14159 * 10e30 | -1.1x10^-9 | -2.34E-12 | -3.14159e-30 | 3.1459E+030 | 1x10^9 | 1E9
Non-Matches 
0.1 x 10^"

# Run the extractor built from `pat` earlier in this snippet over the sample.
ex_scientific_notation(txt)
trinker commented 8 years ago
#http://www.regexlib.com/REDetails.aspx?regexp_id=2288
#Art Araya
# Pattern for scripture-style references, read left to right:
#   * optional book number: 1-3 or I/II/III, then optional whitespace
#   * book: a capitalized word of two or more letters, or the literal
#     "Song of Songs" / "Song of Solomon"
#   * `.?` -- one optional character of any kind
#     NOTE(review): the dot is unescaped; presumably meant to allow the
#     period of an abbreviation such as "Rom." -- confirm before tightening.
#   * chapter: one or two digits, optionally preceded by a 1, then ":"
#   * verse: 1-3 digits, followed by any number of ","- or "-"-separated
#     1-3 digit verses
#   * any number of ";"-separated follow-on references of the same shape, in
#     which the book name itself is optional (e.g. "; 3:12-22, 25")
pat <- '(?:(?:[123]|I{1,3})\\s*)?(?:[A-Z][a-zA-Z]+|Song of Songs|Song of Solomon).?\\s*(?:1?[0-9]?[0-9]):\\s*\\d{1,3}(?:[,-]\\s*\\d{1,3})*(?:;\\s*(?:(?:[123]|I{1,3})\\s*)?(?:[A-Z][a-zA-Z]+|Song of Songs|Song of Solomon)?.?\\s*(?:1?[0-9]?[0-9]):\\s*\\d{1,3}(?:[,-]\\s*\\d{1,3})*)*'
# Build the extractor from the pattern.
# NOTE(review): `ex_` is not defined in this snippet; presumably it is
# qdapRegex's extractor factory -- confirm the package is attached.
ex_bible_verse <- ex_(pattern=pat)

# Sample input: filler prose with references embedded in it, a "|"-separated
# list of further references, and a "Non-Matches" list of look-alike strings.
txt <- "negative numbers. Make sure to use a case 2 Kings 2:11 insensitive pattern matll, but never both). Leviticus 3:3-4 | II Ki. 2:11; 3:12-22, 25 | 2Cor 3:16; Rom. 12:1-5,7,9
Non-Matches 
Lev chap 3 vv3-4 | 2nd Kings 2:11 | Romans 12"
# Run the extractor over the sample text.
ex_bible_verse(txt)